#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time
import traceback
from datetime import timedelta

from tornado import httpclient, gen, ioloop, queues


class AsySpider(object):
    """A simple asynchronous spider built on Tornado coroutines.

    URLs are fed into a Tornado queue and consumed by ``concurrency``
    worker coroutines. Subclasses customise the crawl by overriding
    ``fetch`` (request options), ``handle_html`` (what to do with a
    successfully fetched page) and/or ``handle_response``.
    """

    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        """Prepare the crawl state.

        Args:
            urls: URLs to crawl, in the order they should be scheduled.
            concurrency: Number of worker coroutines started by ``_run``;
                also the maximum number of URLs moved from ``self.urls``
                into the queue after each completed fetch.
            results: Optional list stored as ``self.results``; a new empty
                list is created when omitted.
            **kwargs: Accepted for compatibility; not used here.
        """
        # Work on a reversed copy: URLs are taken with ``pop()`` from the
        # end, so the reversed order yields them in their original order,
        # and copying leaves the caller's list untouched.
        pending = list(urls)
        pending.reverse()
        self.urls = pending
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()  # URLs whose fetch has been started
        self._fetched = set()   # URLs whose response has been handled
        # Always define ``results``, including when the caller passes one.
        self.results = [] if results is None else results

    def fetch(self, url, **kwargs):
        """Start an asynchronous HTTP request for ``url``.

        Extra keyword arguments are forwarded to
        ``AsyncHTTPClient.fetch``; ``raise_error=False`` is always passed.

        Returns:
            The future returned by ``AsyncHTTPClient.fetch``.
        """
        client = httpclient.AsyncHTTPClient()
        return client.fetch(url, raise_error=False, **kwargs)

    def handle_html(self, url, html):
        """Handle the body of a successfully fetched page (prints the URL)."""
        print(url)

    def handle_response(self, url, response):
        """Dispatch on the response status; override if necessary.

        ``response`` may be the exception object returned by ``get_page``
        when the fetch raised, so the status code is read defensively: an
        object without a ``code`` attribute is ignored here (``get_page``
        has already printed the error).
        """
        code = getattr(response, 'code', None)
        if code == 200:
            self.handle_html(url, response.body)
        elif code == 599:
            # Retry: forget the URL so the duplicate check in the worker
            # lets it through again, then put it back on the queue.
            # NOTE(review): there is no retry limit, so a URL that keeps
            # answering 599 is re-queued indefinitely.
            self._fetching.discard(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        """Fetch ``url`` and return the response.

        If the fetch raises, the error is printed and the exception object
        itself is returned instead of a response.
        """
        try:
            response = yield self.fetch(url)
        except Exception as e:
            print('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)

    @gen.coroutine
    def _run(self):
        """Crawl every URL in ``self.urls`` using the worker coroutines."""

        @gen.coroutine
        def fetch_url():
            """Take one URL from the queue, fetch it and handle the result."""
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    # Already started elsewhere; skip the duplicate.
                    return
                self._fetching.add(current_url)
                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)
                self._fetched.add(current_url)
                # Refill the queue with up to ``concurrency`` pending URLs.
                for _ in range(self.concurrency):
                    if not self.urls:
                        break
                    yield self._q.put(self.urls.pop())
            finally:
                # Always balance the ``get`` above so ``join`` can finish.
                self._q.task_done()

        @gen.coroutine
        def worker():
            """Process queue items forever; survive per-URL failures."""
            while True:
                try:
                    yield fetch_url()
                except Exception:
                    # A failing handler must not terminate the worker,
                    # otherwise the queue could never be drained.
                    traceback.print_exc()

        if not self.urls:
            # Nothing to crawl; avoid popping from an empty list.
            return

        self._q.put(self.urls.pop())  # add first url

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()
        yield self._q.join(timeout=timedelta(seconds=300000))

        # Report URLs that were started but not finished (and vice versa).
        # A plain comparison is used because ``assert`` is stripped
        # under ``python -O``.
        if self._fetching != self._fetched:
            print(self._fetching - self._fetched)
            print(self._fetched - self._fetching)

    def run(self):
        """Run the crawl to completion on the current IOLoop."""
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)


class MySpider(AsySpider):
    """Example spider that sends a fixed cookie and User-Agent header."""

    def fetch(self, url, **kwargs):
        """Override the parent ``fetch`` to add cookies, headers, timeout, etc.

        Keyword arguments supplied by the caller are forwarded; the
        default headers below are used only when the caller passes no
        ``headers`` of its own.
        """
        # NOTE(review): this hard-coded cookie embeds session/credential
        # data; consider loading it from configuration instead of source.
        cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%2233153%22%2C%22pswd%22%3A%228835d2c1351d221b4ab016fbf9e8253f%22%2C%22_code%22%3A%22f779dcd011f4e2581c716d1e1b945861%22%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh-cn; SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; CNZZDATA1254842228=1433864393-1442810831-%7C1444972138"
        headers = {
            'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
            'cookie': cookies_str,
        }
        kwargs.setdefault('headers', headers)
        return super(MySpider, self).fetch(url, **kwargs)

    def handle_html(self, url, html):
        """Print the URL of each successfully fetched page."""
        print(url)


def main():
    """Crawl pages 1..9999 of the sample site with ``MySpider``."""
    urls = ['http://www.baidu.com/?page=%s' % page
            for page in range(1, 10000)]
    s = MySpider(urls)
    s.run()


if __name__ == '__main__':
    main()
# "Recent comments" -- stray web-page text captured with the listing; not part of the script.