本文共 2823 字,大约阅读时间需要 9 分钟。
#!C:\Python27\python.exe
# coding=utf8
"""Crawl paginated article lists and download the images of each article.

For every list page the script follows each article link, creates a directory
named after the link text (relative to the current working directory) and
saves every image found inside the article's ``div.ContentBox`` into it.
List pages are processed in parallel by a pool of worker processes.

Targets Python 2.7 (``urllib2`` / ``urlparse``).  ``print`` is always called
with a single parenthesised argument, which behaves the same as the print
statement on Python 2.
"""
import os
import socket
import urllib
import urllib2
import urlparse

from bs4 import BeautifulSoup
from multiprocessing import Pool

# Process-wide default so that sockets opened indirectly (for example by
# urllib.urlretrieve, which takes no timeout argument) cannot block forever.
socket.setdefaulttimeout(60)

USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')
FETCH_TIMEOUT = 60  # seconds, passed to urllib2.urlopen
FETCH_FAILED = 1    # sentinel returned by url_open() when a page is unavailable


def url_open(url):
    """Fetch *url* and return it parsed as a ``BeautifulSoup`` document.

    The response body is decoded as gb2312, ignoring undecodable bytes, and
    parsed with the lxml parser.

    Returns:
        The parsed document, or the integer ``1`` (``FETCH_FAILED``) when the
        page could not be downloaded.  HTTP errors, other URL errors and
        socket errors (including timeouts) are all reported this way.
    """
    request = urllib2.Request(url=url, headers={'User-Agent': USER_AGENT})
    try:
        page = urllib2.urlopen(request, timeout=FETCH_TIMEOUT)
        try:
            # read() can fail as well, so it stays inside the outer try.
            contents = page.read()
        finally:
            page.close()
    except (urllib2.URLError, socket.error) as e:
        # urllib2.HTTPError is a subclass of URLError, so it is covered too.
        print("Failed to fetch %s: %s" % (url, e))
        return FETCH_FAILED
    return BeautifulSoup(contents.decode('gb2312', 'ignore'), "lxml")


def retrieve_img(dir, link):
    """Download every image inside ``div.ContentBox`` of the page at *link*.

    Args:
        dir: existing directory the images are written to.  The name shadows
            the builtin but is kept so keyword callers keep working.
        link: URL of the article page; relative image sources are resolved
            against it.

    A failed download is reported and skipped so the remaining images are
    still attempted.
    """
    soup = url_open(link)  # fetch once and reuse the result
    if soup == FETCH_FAILED:
        return
    for box in soup.find_all("div", "ContentBox"):
        for img_item in box.find_all('img'):
            raw_img = img_item.get('src')
            if not raw_img:
                continue  # <img> without a usable src attribute
            # urljoin turns protocol-relative ("//host/a.jpg"), absolute and
            # relative sources into a full URL.
            img_url = urlparse.urljoin(link, raw_img)
            # Name the file after the last path segment only, so a query
            # string never ends up in the file name.
            img_name = urlparse.urlsplit(img_url).path.rsplit('/', 1)[-1]
            if not img_name:
                continue
            print("Download: %s" % img_url)
            try:
                urllib.urlretrieve(img_url, os.path.join(dir, img_name))
            except (IOError, socket.error) as e:
                print("Failed to download %s: %s" % (img_url, e))


def crawler(root, url):
    """Process one article-list page.

    For each ``<li>`` inside ``div.box_con.newslist`` the first link is
    followed: a directory named after the link text is created if necessary
    and the article's images are downloaded into it.

    Args:
        root: prefix prepended to each article ``href`` to build its URL.
        url: URL of the list page to crawl.
    """
    soup = url_open(url)  # fetch once and reuse the result
    if soup == FETCH_FAILED:
        return
    for block in soup.find_all("div", "box_con newslist"):
        for item in block.find_all('li'):
            anchor = item.find('a')
            if anchor is None or not anchor.get('href'):
                continue  # list entry without a usable link
            link = root + anchor.get('href')
            text = anchor.get_text()
            print("Creating directory %s..." % text)
            try:
                # Try to create and handle failure: checking isdir() first
                # would race with the other worker processes.
                os.mkdir(text)
            except OSError as e:
                # os.mkdir raises OSError (not IOError) on failure.
                if not os.path.isdir(text):
                    print(e)
                    continue
                print("Directory %s already exists!" % text)
            else:
                print("Directory %s successfully created!" % text)
            print("Crawling image page %s." % link)
            retrieve_img(text, link)


def single_func(num):
    """Crawl list page number *num*; this is the unit of work for the pool."""
    root = 'http://xxx.com'
    url = "%s/articlelist/?20-%s.html" % (root, num)
    crawler(root, url)


if __name__ == '__main__':
    pages = range(1, 187)
    pool = Pool(processes=8)
    results = [pool.apply_async(single_func, (page,)) for page in pages]
    pool.close()
    pool.join()
    # Inspect every task, not just the last one submitted; an exception
    # raised inside a worker is otherwise lost silently.
    failed = [page for page, res in zip(pages, results)
              if not res.successful()]
    if failed:
        print("Failed pages: %s" % failed)
    else:
        print('Successful!')