My first web crawler

Target

Get the titles and URLs of all my blog posts.

Design

[Design diagram]

First, the crawler needs a controller (WebCrawler) to schedule the whole process. The controller sends the root URL to the urlManager, then checks whether there are any URLs left in the set. If the set is not empty, the crawler pops one URL from it, downloads the corresponding web page, and sends the content to the parser. The parser returns the data we want. As long as the set is not empty, the crawler repeats this cycle.

urlManager collects all the list-page URLs starting from the root URL and adds them to new_urls (a set). When the controller pops a URL, urlManager moves it from new_urls to old_urls (another set) so the same page is never crawled twice. urlManager also provides the methods has_new_url and get_new_url.

htmlDownLoader downloads the web page at the given URL.

htmlParser helps us grab the useful information. We have to tell it where the information sits on the page (located by tag name together with the tag's id, class, and other attributes). The parser then returns what we want from the given page.

Finally, the crawler needs to present the result. Every time the parser returns data, htmlOutputer appends it to an internal list. Once every URL in the set has been used, htmlOutputer writes all the collected data into a single HTML file, which we can open in Firefox to check the result.
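
Put together, the control flow looks roughly like this (a sketch only; the full code follows in the Code section):

# high-level crawl loop, matching the components described above
urls.add_new_urls(root_url)        # urlManager seeds the url set from the root page
while urls.has_new_url():          # any unvisited url left?
    url = urls.get_new_url()       # pop one and mark it as visited
    page = downloader.download(url)
    data = parser.parse(url, page)
    outputer.collect_data(data)
outputer.output_html()             # write everything out once the set is drained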

Structure

[Project structure screenshot]

Code

WebCrawler.py

import htmlDownLoader
import htmlOutputer
import htmlParser
import urlManager

class Controller(object):
    def __init__(self):
        self.urls = urlManager.UrlManager()
        self.downloader = htmlDownLoader.HtmlDownloader()
        self.parser = htmlParser.HtmlParser()
        self.outputer = htmlOutputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_urls(root_url)
        while self.urls.has_new_url():
            new_url = None
            try:
                new_url = self.urls.get_new_url()
                print "craw %d : %s" % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_datum = self.parser.parse(new_url, html_cont)
                self.outputer.collect_data(new_datum)
                count += 1
            except Exception:
                # a single bad page should not stop the whole crawl
                print "craw failed : %s" % (new_url)
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://blog.callouweicheng.cn/"
    obj_spider = Controller()
    obj_spider.craw(root_url)

urlManager.py

from bs4 import BeautifulSoup
import htmlDownLoader

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        self.downloader = htmlDownLoader.HtmlDownloader()

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # move the url from new_urls to old_urls so it is never crawled twice
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, root_url):
        if root_url is None:
            return
        html_cont = self.downloader.download(root_url)
        if html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        # pagination links look like: <a class="page-number" href="/page/2/">2</a>
        nums = soup.find_all("a", class_="page-number")
        if not nums:
            return
        # the last page-number link holds the largest page index
        end_num = int(nums[-1].get_text())
        self.new_urls.add(root_url)
        for i in range(2, end_num + 1):
            self.new_urls.add("http://tonyzhang94.github.io/page/" + str(i))

htmlDownLoader.py

import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # only a 200 response counts as a successful download
        if response.getcode() != 200:
            return None
        return response.read()
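
This downloader targets Python 2, where urllib2 is available. On Python 3 that module was folded into urllib.request; a minimal sketch of the same downloader there might look like:

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()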

htmlParser.py

from bs4 import BeautifulSoup

class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        new_data = self._get_new_data(page_url, soup)
        return new_data

    def _get_new_data(self, page_url, soup):
        res_data = list()
        # post links look like:
        # <a class="post-title-link" href="/2016/05/17/Generating-function-Implements:Credits-Combination/" itemprop="url">
        #     Generating Function Implements:Credits Combination
        # </a>
        titles = soup.find_all("a", class_="post-title-link")
        for title in titles:
            data = {}
            # href is site-relative, so prepend the site root
            data["url"] = "http://tonyzhang94.github.io" + title["href"]
            data["title"] = title.get_text()
            res_data.append(data)
        return res_data
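
The parser can be tried on its own, without touching the network, by handing parse a hard-coded page (the post below is made up for illustration):

from htmlParser import HtmlParser

sample = '<a class="post-title-link" href="/2016/01/01/demo-post/">Demo Post</a>'
parser = HtmlParser()
print parser.parse("http://tonyzhang94.github.io/", sample)
# expect one dict holding the absolute url and the title "Demo Post"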

htmlOutputer.py

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def output_html(self):
        fout = open("output.html", "w")
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data["url"].encode("utf-8"))
            fout.write("<td>%s</td>" % data["title"].encode("utf-8"))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()  # close the file so everything is flushed to disk

    def collect_data(self, datum):
        if datum is None:
            return
        for data in datum:
            self.datas.append(data)
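
If you prefer the file to be closed automatically even when a write raises, output_html can be written with a with block, as a drop-in replacement for HtmlOutputer.output_html (behavior otherwise unchanged):

def output_html(self):
    with open("output.html", "w") as fout:
        fout.write("<html><body><table>")
        for data in self.datas:
            fout.write("<tr><td>%s</td><td>%s</td></tr>"
                       % (data["url"].encode("utf-8"), data["title"].encode("utf-8")))
        fout.write("</table></body></html>")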

Output

Run WebCrawler in an IDE (PyCharm here):

Open output.html in Firefox