Building a Python crawler under the Scrapy framework

"Static crawler" (crawler.py)
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import re

'''
browser = webdriver.PhantomJS()
browser.get('http://www.aae.wisc.edu/')
print(browser.title)
'''

faculty_name = []
faculty = {}

# Render the faculty listing with PhantomJS so any JavaScript-generated markup
# is present, then parse the resulting HTML with BeautifulSoup.
driver = webdriver.PhantomJS()
driver.get("http://www.cs.wisc.edu/people/faculty/")
soup = BeautifulSoup(driver.page_source, "html.parser")

# Faculty names
a = 0
for item in soup.find_all(class_="views-field views-field-field-full-name"):
    print(item.contents[1].string)
    a = a + 1
print(a)
print("************************")

# Faculty titles
b = 0
for item in soup.find_all(class_="views-field views-field-field-title"):
    print(item.contents[1].string)
    b = b + 1
print(b)

# Personal page URLs: the links on the listing page are relative,
# so prepend the site root to get absolute URLs.
c = 0
personal_url = []
for item in soup.find_all(class_="views-field views-field-field-full-name"):
    print("www.cs.wisc.edu" + item.a['href'])
    personal_url.append("http://www.cs.wisc.edu" + item.a['href'])
    c = c + 1
print(c)

# Fetch the first personal page and look for a "Personal Website:" label,
# then print the nearest link next to it.
i = 0
while i < 1:
    print(personal_url[i])
    response = urllib.request.urlopen(personal_url[i])
    content = response.read()
    soup = BeautifulSoup(content, "html.parser")
    for label in soup.find_all(string=re.compile("Personal Website")):
        print(label.find_parent().a)
    i = i + 1

This is the code I used to crawl some static web pages and parse the HTML.
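
Although the post is about the Scrapy framework, the script above only uses Selenium and BeautifulSoup. For comparison, here is a minimal sketch of the same faculty-listing scrape written as a Scrapy spider. The spider name, the output field names, and the CSS selector are assumptions based on the CSS classes the script above searches for, and Scrapy by itself does not execute JavaScript, so this sketch assumes the listing is present in the static HTML.

import scrapy


class FacultySpider(scrapy.Spider):
    # Hypothetical spider name, chosen for illustration.
    name = "cs_faculty"
    start_urls = ["http://www.cs.wisc.edu/people/faculty/"]

    def parse(self, response):
        # Each faculty entry is assumed to carry the same CSS class
        # that the BeautifulSoup version above searches for.
        for entry in response.css("div.views-field-field-full-name"):
            name = entry.css("a::text").get()
            href = entry.css("a::attr(href)").get()
            yield {
                "name": name,
                # urljoin turns the relative link into an absolute URL.
                "profile_url": response.urljoin(href),
            }

Assuming the spider is saved as faculty_spider.py (a hypothetical filename), it can be run with "scrapy runspider faculty_spider.py -o faculty.json", which writes one JSON record per faculty entry instead of printing to the console.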