# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf8")
from gevent import monkey
monkey.patch_all()
import requests
import redis
import gevent
from gevent.pool import Pool
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient, ReadPreference
from requests.exceptions import ConnectionError  # used by gevent.Timeout below (see traceback)
import json
import redis.connection
# point redis-py at gevent's cooperative socket module
redis.connection.socket = gevent.socket
# JobProjectConfiguration holds the host/port settings; its import is not
# shown in the original post
mongo_connection = MongoClient(
    '%s:%d' % (
        JobProjectConfiguration.save_mongo_host,
        JobProjectConfiguration.save_mongo_port),
    read_preference=ReadPreference.SECONDARY,
    max_pool_size=10, use_greenlets=True)
mongo_db = mongo_connection.jobdigg
redis_connection = redis.ConnectionPool(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db
)
redis_proxy_pool = redis.ConnectionPool(
    host=JobProjectConfiguration.proxy_queue_redis_host,
    port=JobProjectConfiguration.proxy_queue_redis_port,
    db=JobProjectConfiguration.proxy_queue_redis_db
)
# placeholder: the original post elides the proxy-pool object, which is
# expected to expose getProxy() (see GenerateUrl below)
proxy_pool = []
pool_num = 100
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36"
}
def WYUrlGenerator():
    print '51 dig start: generating urls...'
    start = time.time()
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = WYJobUrlYield()
    gpool = Pool(pool_num)
    for uargs in urllist:
        gpool.spawn(GenerateUrl, uargs)
    gpool.join()
    # from here on, keep pulling urls out of the error set and retrying
    # them until the set is drained
    length = redis_db.scard("error_url_list")
    while length > 0:
        errorlist = ErrorUrlGenerator()
        epool = Pool(pool_num)
        for url in errorlist:
            epool.spawn(GenerateUrl, url)
        epool.join()
        length = redis_db.scard("error_url_list")
    end = time.time()
    print 'dig end: all urls done, total time is %0.2f' % (end - start)
def WYJobUrlYield():
    for page in xrange(1, 3001):
        url = "http://some.crawl.url with page num %s" % page
        jobitem = {
            "url": url,
            "type": "jobtype"
        }
        jobvalue = json.dumps(jobitem)
        yield jobvalue
# pull urls back out of the error set so they can be processed again
def ErrorUrlGenerator():
    redis_db = redis.Redis(connection_pool=redis_connection)
    urllist = redis_db.smembers("error_url_list")
    for url in urllist:
        yield url
def GenerateUrl(sourcejob):
    redis_db = redis.StrictRedis(connection_pool=redis_connection)
    pipe = redis_db.pipeline()
    newitem = json.loads(sourcejob)
    url = newitem["url"]
    urltype = newitem["type"]
    try:
        ip = proxy_pool.getProxy()
        proxy = {"http": "http://" + ip["proxy"]}
        # cap each request at 5 seconds so one slow fetch cannot block the
        # requests behind it; after 5 seconds the timer throws
        # ConnectionError into this greenlet (note: the timer is started
        # here but never cancelled)
        timeout = gevent.Timeout(5, ConnectionError)
        timeout.start()
        r = requests.get(url, headers=header, proxies=proxy)
        jobs = BeautifulSoup(r.text)
        if urltype == "urltype":  # collect every url on the page, then save them into a redis set
            results = jobs.findAll("a", {"class": "classname"})
            for result in results:
                url = result["href"]
                urlitem = {
                    "url": url,
                    "type": "urltype"
                }
                urlvalue = json.dumps(urlitem)
                pipe.sadd("url_list", urlvalue)  # save the collected url into the url_list set
        pipe.srem("error_url_list", sourcejob)  # reaching here means the url, if it came from the error set, has been handled, so drop it
        pipe.execute()
    except Exception as e:
        error_name = e.__class__.__name__
        if error_name == "ConnectionError" or error_name == "ProxyError":
            # connection and proxy failures are thrown back into a dedicated
            # error-url set so they can be picked up and retried on the next pass
            redis_db.sadd('error_url_list', sourcejob)
            # This is where my most maddening problem lives. Everything else
            # works, but occasionally, when the program starts up, this sadd
            # itself throws an exception. Since this handler is what saves a
            # failed url into error_url_list for reprocessing, an exception
            # during the add means a large share of the failed urls never
            # reach the set, and in the end far too little data gets crawled.
            # The exception looks roughly like this:
            # ConnectionError
            # <timer at 0x36c8c80 callback=<bound method Greenlet.throw of <Greenlet at 0xc844050>> args=(<class 'requests.exceptions.ConnectionError'>,)> failed with ConnectionError
            # Traceback (most recent call last):
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/greenlet.py", line 327, in run
            #     result = self._run(*self.args, **self.kwargs)
            #   File "61.py", line 147, in GenerateUrl
            #     redis_db.sadd('error_url_list', sourcejob)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 1248, in sadd
            #     return self.execute_command('SADD', name, *values)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 461, in execute_command
            #     return self.parse_response(connection, command_name, **options)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/client.py", line 471, in parse_response
            #     response = connection.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 339, in read_response
            #     response = self._parser.read_response()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 110, in read_response
            #     response = self.read()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/redis/connection.py", line 103, in read
            #     return self._fp.readline()[:-2]
            #   File "/usr/local/lib/python2.7/socket.py", line 447, in readline
            #     data = self._sock.recv(self._rbufsize)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 392, in recv
            #     self._wait(self._read_event)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/socket.py", line 298, in _wait
            #     self.hub.wait(watcher)
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 341, in wait
            #     result = waiter.get()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 568, in get
            #     return self.hub.switch()
            #   File "/data/home/zp/ZP_VIRENV02/lib/python2.7/site-packages/gevent/hub.py", line 331, in switch
            #     return greenlet.switch(self)
if __name__ == '__main__':
    st = time.time()
    time.sleep(5)
    WYUrlGenerator()
    et = time.time()
    print "**************end****************, total time spent is %0.2f" % (et - st)
1
jander 2014-06-05 11:34:16 +08:00
You should add:
from gevent import monkey; monkey.patch_socket()
2
penkchow OP
3
jander 2014-06-05 12:10:45 +08:00
Oh, I didn't read that carefully. It's a redis connection exception. Your code builds a redis.ConnectionPool explicitly, but you can connect to redis directly; redis-py already implements pooling internally: redis.StrictRedis(host='localhost', port=6379, db=0). Try a direct connection.
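A minimal sketch of the direct connection suggested above, assuming the same JobProjectConfiguration settings as the original code; redis-py creates and manages a connection pool internally, so the explicit ConnectionPool objects are optional:

redis_db = redis.StrictRedis(
    host=JobProjectConfiguration.url_queue_redis_host,
    port=JobProjectConfiguration.url_queue_redis_port,
    db=JobProjectConfiguration.url_queue_redis_db)
# same call as in GenerateUrl, now on a directly constructed client
redis_db.sadd('error_url_list', sourcejob)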
4
jsonline 2014-06-05 12:11:47 +08:00
Every month someone shows up to ask a crawler question.
10
penkchow OP
Okay, I'll give it a try.