import os
import base64

import requests
from pyquery import PyQuery as pq

# assumption: headers is a plain UA dict so the mobile site answers; adjust as needed
headers = {'User-Agent': 'Mozilla/5.0'}

url1 = 'http://m.taduo.net'
url2 = '/manhua/2/'
resp = requests.get(url1 + url2, headers=headers)
doc = pq(resp.content[1000:])  # pyquery has a max length ? skipping the leading chunk keeps it happy
# quote the attribute value instead of backslash-escaping the slashes
lst = doc('ul>li>a[href^="%s"]' % url2)

urls = []
for l in lst.items():
    urls.insert(0, url1 + l.attr('href'))  # prepend to reverse the on-page order
#print(urls[:10])
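A quick check that the attribute selector does what we want, on a synthetic fragment (nothing here comes from the real page; pyquery's cssselect backend handles the quoted ^= prefix match):

sample = pq('<ul><li><a href="/manhua/2/1.html">ch1</a></li>'
            '<li><a href="/news/x.html">x</a></li></ul>')
print([a.attr('href') for a in sample('ul>li>a[href^="/manhua/2/"]').items()])
# -> ['/manhua/2/1.html']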
def getIndex(c):
    # map a base-36 character to its index: '0'-'9' -> 0-9, 'a'-'z' -> 10-35
    d = ord(c)
    if d < 60:
        return d - 48  # digit
    else:
        return d - 87  # lowercase letter
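In effect getIndex(c) is int(c, 36) for a single character. A quick sanity check:

for c in '09az':
    print(c, getIndex(c), int(c, 36))  # the two values agree: 0, 9, 10, 35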
def decode(data64):
    # the page list is hidden in a base64-encoded, packer-style script:
    #   ...=[\'enc1\',\'enc2\']','word0|word1|...'.split('|')...
    # split out the encoded page entries and the word dictionary
    raw = base64.b64decode(data64).decode('utf-8')
    data = raw.split('=[\\\'')[1].split('\'.split')[0].split('\\\']\',')
    encoded = data[0].split('\\\',\\\'')            # the encoded page entries
    dictionary = data[1].split('\'')[1].split('|')  # the word dictionary
    result = []
    for e in encoded:
        res = ''
        for ec in e.split():
            for c in ec:
                if c == '/' or c == '.':
                    res += c  # path separators pass through unchanged
                else:
                    res += dictionary[getIndex(c)]  # replace each base-36 digit with its word
        result.append(res)
    return result
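To make the string surgery concrete, here is decode run on a synthetic packed string shaped like what the site is assumed to embed; the words img|001|page1|jpg|page2 are made up for illustration:

sample_js = ("eval(function(p,a,c,k,e,d){}"
             "('cp=[\\'0/1/2.3\\',\\'0/1/4.3\\']',"
             "'img|001|page1|jpg|page2'.split('|'),0,{}))")
print(decode(base64.b64encode(sample_js.encode('utf-8'))))
# -> ['img/001/page1.jpg', 'img/001/page2.jpg']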
chapter = 1  # start from 1
start = chapter - 1  # start index

for url in urls[start:]:
    print('Doing: %d | %s' % (chapter, url))
    resp = requests.get(url, headers=headers)
    # the payload sits in the page as  ... cp="<base64>"; ...
    # (resp.text instead of str(resp.content): the latter only worked by
    # accident on the b'...' repr)
    data64 = resp.text.split('cp="')[1].split('";')[0]
    pages = decode(data64)
    #print(pages)
    # save
    targetDir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'asset', str(chapter))
    if not os.path.isdir(targetDir):
        os.makedirs(targetDir)  # makedirs also creates the parent 'asset' dir if missing

    # get
    for p in range(len(pages)):
        purl = 'http://mh.jiduo.cc/' + pages[p]
        print(' - getting -> book:' + str(chapter) + ' - page:' + str(p).rjust(4, '0'))
        tc = 0
        while True:  # retry on network errors, up to 10 attempts
            try:
                resp = requests.get(purl, headers=headers, timeout=10)
                break
            except requests.exceptions.RequestException:
                print('- timeout: %d' % tc)
                tc += 1
                if tc > 10:
                    print('- network error -')
                    exit()
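        # --- a hedged sketch of the save step, which the excerpt stops short of ---
        # assumption: resp.content now holds the image bytes and gets written into
        # targetDir; the zero-padded filename and the extension fallback are mine,
        # not confirmed by the original code
        ext = os.path.splitext(pages[p])[1] or '.jpg'  # '.jpg' fallback is an assumption
        with open(os.path.join(targetDir, str(p).rjust(4, '0') + ext), 'wb') as f:
            f.write(resp.content)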