My skills are still pretty basic, haha, but it's fun to have a crawler actually running. I found a simple site to practice on: http://www.mdyuepai.com/
import re
import os
import requests
from hashlib import md5
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_page_index(offset):
    """Fetch one page of the index listing."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    url = 'http://www.mdyuepai.com/?page=' + str(offset)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error requesting index page')
        return None


def parse_index_page(html):
    """Yield the relative links to detail pages found on an index page."""
    # The original regex came through empty when this was posted;
    # see the sketch after the code for what it might look like.
    pattern = re.compile('', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield item


def get_page_detail(url):
    """Fetch a detail page."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Error occurred')
        return None


def save_image(content):
    """Write the image bytes to disk, named by the MD5 of the content."""
    file_path = '{0}/{1}.{2}'.format('/home', md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def download_image(url):
    """Download one image and hand it to save_image."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('Error downloading image', url)


def parse_page_detail(html):
    """Pull the image URLs out of a detail page and download them."""
    pattern = re.compile('class="item_infor_img.*?src="(.*?)".*?', re.S)
    images = re.findall(pattern, html)
    for image in images:
        download_image(image)


def main(offset):
    """Crawl one index page: follow every detail link and grab its images."""
    html = get_page_index(offset)
    if not html:
        return
    for item in parse_index_page(html):
        url = 'http://www.mdyuepai.com/' + item
        html2 = get_page_detail(url)
        if html2:
            parse_page_detail(html2)


if __name__ == '__main__':
    # Crawl the first 20 index pages in parallel.
    pool = Pool()
    pool.map(main, range(20))
    pool.close()
    pool.join()
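One note: the regex in parse_index_page shows up empty above (it apparently got stripped when the post was saved), so that part won't do anything as written. As a rough sketch only, assuming the index page links each set with an ordinary <a href="..."> tag (I'm guessing at the markup of www.mdyuepai.com here, it may not match the real page), it would be something along these lines:

# Hypothetical replacement for the lost pattern; the <a href="..."> structure
# is an assumption, not taken from the actual site.
def parse_index_page(html):
    pattern = re.compile('<a[^>]*?href="(.*?)"', re.S)
    for item in re.findall(pattern, html):
        yield item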