# realcodespider


import requests
import re
from bs4 import BeautifulSoup
class Spider:
    """Fetch question pages from ctguqmx.com and print one author's posts.

    ``main`` walks a range of question ids, ``load`` downloads a single
    page, and ``deals`` parses the HTML and prints every matching post.
    """

    # Site root; ``main`` also requests it before each question page.
    BASE_URL = "https://www.ctguqmx.com"
    # Author display name whose posts ``deals`` prints.
    TARGET_AUTHOR = "钟俊威"
    # Seconds to wait on the server; without a timeout requests can block
    # forever on a stalled connection.
    REQUEST_TIMEOUT = 10
    # Headers sent with every request (browser-like User-Agent).
    HEADERS = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0",
    }

    def __init__(self):
        pass

    def deals(self, html):
        """Parse *html*, dump it prettified, and print the target author's posts.

        Args:
            html: Markup of one page, as text.

        For every ``<div class="mod-head">`` whose nested ``div > p > a``
        text equals ``TARGET_AUTHOR``, prints that name followed by the text
        of the first ``div`` inside the element's next sibling.
        """
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.prettify())
        for head in soup.find_all('div', class_="mod-head"):
            try:
                author = head.div.p.a.get_text()
                if author != self.TARGET_AUTHOR:
                    continue
                body = head.find_next_sibling().div.get_text()
            except AttributeError:
                # A missing tag yields None somewhere along the chain; such
                # entries simply do not have the expected layout, skip them.
                continue
            print(author + body)

    def load(self, url):
        """Download *url* and hand its body to ``deals``.

        Args:
            url: Absolute URL of the page to fetch.

        Prints the URL first. On a request failure prints ``"No"`` and
        returns without parsing, since there is no page text to process.
        """
        print(url)
        try:
            response = requests.get(
                url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            print("No")
            return
        self.deals(response.text)

    def main(self, start=300, stop=400):
        """Crawl question pages with ids in ``range(start, stop)``.

        Args:
            start: First question id to fetch (inclusive).
            stop: Question id at which to stop (exclusive).
        """
        question_url = self.BASE_URL + "/question/"
        for question_id in range(start, stop):
            # NOTE(review): the site root is requested before every question
            # page, as in the original code; the reason is not visible here.
            self.load(self.BASE_URL)
            self.load(question_url + str(question_id))
if __name__ == '__main__':
    # Start the crawl only when run as a script, not when imported.
    spider = Spider()
    spider.main()