域名爬虫 3搜索引擎

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import requests
import re
from time import time
from multiprocessing.dummy import Pool as ThreadPool


class Spider(object):
    """Domain-name spider: scrape the result pages of Baidu, So (360)
    and Bing for a keyword and collect the domains they display.

    Collected domains accumulate in ``self.domain_list``, a dict whose
    keys act as an insertion-ordered de-duplicating set (values are
    always ``None``).

    BUG FIX: the original declared ``class (object):`` with the class
    name missing (a SyntaxError); the script instantiates ``Spider()``,
    so the name is restored here.
    """

    def __init__(self):
        # Dict used as an ordered set of unique domains (values unused).
        self.domain_list = {}
        # Per-request timeout in seconds.
        self.timeout = 20
        # NOTE(review): the dots in this pattern are unescaped, so '.'
        # matches ANY character; kept byte-identical to preserve the
        # existing (loose) matching behavior.
        self.domain_re = re.compile(r'[a-zA-Z0-9]*.?[a-zA-Z0-9]*.?[a-zA-Z0-9]*.?[a-zA-Z0-9]*.?'
                                    r'[a-zA-Z0-9]*.?[a-zA-Z0-9]+.[a-zA-Z]+')
        # Per-engine patterns extracting the displayed URL from a result page.
        self.baidu_url_re = re.compile('class="c-showurl" style="text-decoration:none;">(.*?)</a>')
        self.so_domain_re = re.compile(r'data-url="(.*)" {2}rel="noopener"')
        self.bing_domain_re = re.compile(r'<cite>(.*)</cite>')

    def html_get(self, url):
        """Fetch *url* and return the response body as text.

        Returns ``None`` on any request failure (timeout, connection
        error, ...) so callers can simply skip the page.  The original
        used a bare ``except: pass`` which silently swallowed every
        exception; narrowed to request errors only.
        """
        try:
            return requests.get(url, timeout=self.timeout).text
        except requests.RequestException:
            return None

    def baidu_domain(self, keyword, num):
        """Scrape result page *num* of Baidu for *keyword* into domain_list."""
        url = "http://www.baidu.com/s?wd=%s&pn=%d" % (keyword, num * 10)
        data = self.html_get(url)
        if data:
            baidu_url = self.baidu_url_re.findall(data)
            # Baidu wraps the matched keyword in <b>...</b> inside the URL text.
            baidu_url = [line.replace("<b>", "").replace("</b>", "") for line in baidu_url]
            for matches in map(self.domain_re.findall, baidu_url):
                if matches:
                    self.domain_list[matches[0]] = None

    def so_domain(self, keyword, num):
        """Scrape result page *num* of So (360 search) for *keyword*."""
        url = "http://www.so.com/s?q=%s&pn=%d" % (keyword, num)
        data = self.html_get(url)
        if data:
            so_url = self.so_domain_re.findall(data)
            for matches in map(self.domain_re.findall, so_url):
                if matches:
                    self.domain_list[matches[0]] = None

    def bing_domain(self, keyword, num):
        """Scrape result page *num* of Bing for *keyword* into domain_list."""
        # Bing paginates with &first=<1-based result index>, 10 per page.
        url = "https://www.bing.com/search?q=%s&first=%d" % (keyword, num * 10 + 1)
        data = self.html_get(url)
        if data:
            bing_url = self.bing_domain_re.findall(data)
            # Bing bolds the matched keyword with <strong>...</strong>.
            bing_url = [line.replace("<strong>", "").replace("</strong>", "") for line in bing_url]
            for matches in map(self.domain_re.findall, bing_url):
                if matches:
                    self.domain_list[matches[0]] = None


def run(arg):
    """Thread worker: crawl one (keyword, page-number) pair on all three engines.

    Results accumulate in the module-level ``spider`` instance; order of
    engines is Bing, So, then Baidu.
    """
    kw, page = arg
    print("线程启动: 关键字[%s] 页数[%d]" % (kw, page))
    spider.bing_domain(kw, page)
    spider.so_domain(kw, page)
    spider.baidu_domain(kw, page)


def url_list(keyword_list, numbers):
    """Build the [keyword, page] argument pairs fed to the thread pool.

    Pages run from 1 to ``numbers - 1`` inclusive (``range`` upper bound
    is exclusive), for every keyword in *keyword_list*.
    """
    args = []
    for kw in keyword_list:
        for page in range(1, numbers):
            args.append([kw, page])
    return args


# --- Entry point: read keywords, crawl all engines in parallel, save results ---
with open("keyword.txt", 'r') as rf:
    # BUG FIX: original split on the letter "n" (the backslash was lost
    # in transcription); keywords are one per line, so split on newlines.
    keyword_list = rf.read().split("\n")
arg = url_list(keyword_list, 75)
spider = Spider()
pool = ThreadPool(50)
pool.map(arg=arg, func=run) if False else pool.map(run, arg)
pool.close()
pool.join()
# The result/ directory must exist beforehand (see usage notes below).
with open("./result/domain-%s.txt" % (time()), 'w') as wf:
    for domain in spider.domain_list:
        # BUG FIX: original wrote "n" instead of a newline separator.
        wf.write(domain + "\n")

### 使用方法
第一次运行前请先新建 result 文件夹,用于存放爬取到的域名数据。
将关键字写入 keyword.txt,每行一个关键字。