1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
|
import os
import re
from multiprocessing.dummy import Pool as ThreadPool
from time import time

import requests
class Spider(object):
    """Collect domains for search keywords by scraping Baidu, So (360) and
    Bing result pages.

    Results accumulate in ``self.domain_list``, a dict used as a
    de-duplicating set (domain -> None).

    NOTE(review): the scraped original read ``class (object):`` — the name
    was lost in extraction; restored to ``Spider`` to match the call site.
    """

    def __init__(self):
        # domain -> None; dict keys give cheap de-duplication.
        self.domain_list = {}
        # Per-request timeout, seconds.
        self.timeout = 20
        # Loose domain matcher: up to six optional dot-separated labels plus a
        # mandatory "label.tld" tail.  Dots are escaped — the original used
        # bare '.', which matches ANY character and produced false positives.
        self.domain_re = re.compile(
            r'[a-zA-Z0-9]*\.?[a-zA-Z0-9]*\.?[a-zA-Z0-9]*\.?[a-zA-Z0-9]*\.?'
            r'[a-zA-Z0-9]*\.?[a-zA-Z0-9]+\.[a-zA-Z]+'
        )
        # Per-engine extractors for result URLs in the raw HTML.
        self.baidu_url_re = re.compile(
            'class="c-showurl" style="text-decoration:none;">(.*?)</a>')
        self.so_domain_re = re.compile(r'data-url="(.*)" {2}rel="noopener"')
        self.bing_domain_re = re.compile(r'<cite>(.*)</cite>')

    def html_get(self, url):
        """Fetch *url*; return the response body, or None on any request error.

        Narrowed from a bare ``except:`` (which also swallowed
        KeyboardInterrupt/SystemExit) to requests' own error hierarchy.
        """
        try:
            return requests.get(url, timeout=self.timeout).text
        except requests.RequestException:
            # Best effort: a failed page is simply skipped by the callers.
            return None

    def _collect(self, urls):
        """Run the domain regex over *urls*, recording the first match of each."""
        for found in map(self.domain_re.findall, urls):
            if found:
                self.domain_list[found[0]] = None

    def baidu_domain(self, keyword, num):
        """Scrape result page *num* of a Baidu search for *keyword*."""
        url = "http://www.baidu.com/s?wd=%s&pn=%d" % (keyword, num * 10)
        data = self.html_get(url)
        if data:
            urls = self.baidu_url_re.findall(data)
            # Baidu highlights matched terms with <b> tags; strip them.
            urls = [u.replace("<b>", "").replace("</b>", "") for u in urls]
            self._collect(urls)

    def so_domain(self, keyword, num):
        """Scrape result page *num* of a So (360) search for *keyword*."""
        url = "http://www.so.com/s?q=%s&pn=%d" % (keyword, num)
        data = self.html_get(url)
        if data:
            self._collect(self.so_domain_re.findall(data))

    def bing_domain(self, keyword, num):
        """Scrape result page *num* of a Bing search for *keyword*."""
        url = "https://www.bing.com/search?q=%s&first=%d" % (keyword, num * 10 + 1)
        data = self.html_get(url)
        if data:
            urls = self.bing_domain_re.findall(data)
            # Bing highlights matched terms with <strong> tags; strip them.
            urls = [u.replace("<strong>", "").replace("</strong>", "") for u in urls]
            self._collect(urls)
def run(arg):
    """Thread-pool worker: crawl one (keyword, page-number) pair on all three engines."""
    keyword, num = arg[0], arg[1]
    print("线程启动: 关键字[%s] 页数[%d]" % (keyword, num))
    # Same engine order as before: Bing, So, Baidu.
    for crawl in (spider.bing_domain, spider.so_domain, spider.baidu_domain):
        crawl(keyword, num)
def url_list(keyword_list, numbers):
    """Build the work queue: one [keyword, page] pair for each keyword and
    each page in 1..numbers-1 (page 0 is intentionally not generated)."""
    tasks = []
    for kw in keyword_list:
        for page in range(1, numbers):
            tasks.append([kw, page])
    return tasks
if __name__ == "__main__":
    # The published original read split("n") / write(domain + "n"): the "\n"
    # escapes were stripped when the code was pasted online — restored here.
    with open("keyword.txt", "r") as rf:
        keyword_list = rf.read().splitlines()

    # One task per keyword per page (pages 1..74).
    arg = url_list(keyword_list, 75)
    spider = Spider()

    # Thread (not process) pool: the workload is network-bound.
    pool = ThreadPool(50)
    pool.map(run, arg)

    # Ensure the output directory exists before writing results.
    os.makedirs("result", exist_ok=True)
    with open("./result/domain-%s.txt" % (time()), "w") as wf:
        for domain in spider.domain_list:
            wf.write(domain + "\n")
|
近期评论