Writing crawlers in Python, part 5: a multiprocess crawler (scraping 58.com rental listings)


The code in this post builds on the previous post in this series, python写爬虫4-多线程爬虫(采集58出租房信息) (part 4: a multithreaded crawler for the same 58.com rental listings): http://blog.csdn.net/apple9005/article/details/54971151
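
The code below also imports a MongoCache class that was built earlier in this series and is not repeated here. For readers without that post at hand, a minimal sketch of a compatible cache might look like the following. This assumes a local MongoDB server and the pymongo driver; the collection and field names are my own illustration, not necessarily the original implementation. The one behavior the crawler relies on is that a cache miss returns a falsy value rather than raising KeyError.

# MongoCache.py -- a minimal sketch of a compatible cache, NOT the
# original implementation from the earlier post. Assumes a MongoDB
# server on localhost:27017 and the pymongo driver.
from datetime import datetime
from pymongo import MongoClient


class MongoCache(object):
    def __init__(self, client=None):
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache

    def __getitem__(self, url):
        # Return the cached html, or None on a cache miss -- the crawler
        # checks truthiness rather than catching KeyError
        record = self.db.webpage.find_one({'_id': url})
        return record['html'] if record else None

    def __setitem__(self, url, html):
        record = {'html': html, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)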

The multiprocess crawler

#! /usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import lxml.html
import time
from lxml.cssselect import CSSSelector
from MongoCache import MongoCache
import threading
import multiprocessing


def download(url, user_agent='Google', num_retries=2):
    """Download a whole page."""
    print 'Downloading:', url
    # Set the user agent
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = None
        # Retry the download only on 500-599 server errors, at most twice
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html


def get_data(url, process_name):
    """Extract each data field from a detail page."""
    print '------Process Name: %s-----------Thread Name: %s-------' % (process_name, threading.current_thread().getName())
    # Use the cached page if we already have it; otherwise download it, then cache it
    cache = MongoCache()
    if not cache[url]:
        html_text_detail = download(url)
        if not html_text_detail:
            print 'None:', url
        else:
            cache[url] = html_text_detail
    else:
        print 'Exists:', url
        html_text_detail = cache[url]
    try:
        # Pull the individual fields out of the page
        tree = lxml.html.fromstring(html_text_detail)
        house_title = CSSSelector('div.main-wrap > div.house-title > h1')
        house_pay_way1 = CSSSelector('div.house-pay-way > span:nth-child(1)')
        house_pay_way2 = CSSSelector('div.house-pay-way > span:nth-child(2)')
        print house_title(tree)[0].text_content()
        print '%s|%s' % (house_pay_way1(tree)[0].text_content(), house_pay_way2(tree)[0].text_content())
        # The description block is a 7x2 grid of label/value spans
        for i in range(7):
            for j in range(2):
                css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (i + 1, j + 1)
                house_info = CSSSelector(css)
                print house_info(tree)[0].text_content().replace(' ', '')
    except TypeError as e:
        print 'Bad HTML text: %s' % e
    except IndexError as e:
        print 'Error while extracting detail fields: %s' % e


def get_url(html):
    """Collect the set of detail-page links to crawl."""
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')
    url_list = []
    for i in sel(tree):
        if i.get('href') not in url_list:
            url_list.append(i.get('href'))
    return url_list


def create_thread(url_list, process_name):
    """Keep at most 4 threads alive; whenever one finishes, start another,
    until every url has been claimed. Note that threading.active_count()
    includes the main thread of the process."""
    while True:
        if threading.active_count() >= 4:
            time.sleep(1)
        else:
            lock.acquire()  # take the lock before popping from the shared list
            if len(url_list) > 0:
                thr = threading.Thread(target=get_data, args=(url_list.pop(), process_name))
                lock.release()  # release the lock
                thr.start()
            else:
                lock.release()  # release the lock
                break


if __name__ == '__main__':
    url_index = 'http://bj.58.com/chuzu/'
    html_text_list = download(url_index)
    lock = threading.Lock()
    url_list = get_url(html_text_list)  # the urls to crawl
    processes = []
    for i in range(2):
        # Start 2 processes; each runs at most 4 threads
        p = multiprocessing.Process(target=create_thread, args=(url_list, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
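
One caveat worth spelling out: multiprocessing pickles the arguments it passes to each child, so both processes receive their own private copy of url_list (and the threading.Lock only coordinates threads inside a single process). Both children therefore walk the full url set; the MongoCache makes the second pass over an already-downloaded page cheap, but the work is duplicated rather than divided. If you want the two processes to genuinely share one pool of work, a multiprocessing.Queue can hold the urls instead. The sketch below is an illustrative rework, not part of the original post: worker and create_thread_q are hypothetical names, and it reuses the download, get_url and get_data functions from above.

# A sketch of dividing the work between processes with a shared queue.
import Queue  # Python 2 module; used here only for the Empty exception
import multiprocessing
import threading


def worker(q, process_name):
    # Each thread claims urls from the shared queue until it is drained
    while True:
        try:
            url = q.get(timeout=3)  # treat a few seconds of silence as empty
        except Queue.Empty:
            break
        get_data(url, process_name)


def create_thread_q(q, process_name):
    # A fixed pool of 4 threads per process, mirroring the cap above
    threads = [threading.Thread(target=worker, args=(q, process_name))
               for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    work_queue = multiprocessing.Queue()
    for url in get_url(download('http://bj.58.com/chuzu/')):
        work_queue.put(url)
    processes = [multiprocessing.Process(target=create_thread_q, args=(work_queue, i))
                 for i in range(2)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

With a shared queue, each url is claimed exactly once across both processes, so no lock around the list is needed; the queue itself serializes access.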