
使用socket和asyncio库爬取数据
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
|
import asyncio
import socket
from urllib.parse import urlparse
async def get_url(url):
    """Fetch *url* over a raw asyncio TCP stream and return the response text
    split on the blank line separating HTTP headers from the body.

    Sends a minimal HTTP/1.1 GET with ``Connection: close`` on port 80
    (plain HTTP only — no TLS). Prints the number of header/body sections
    and returns them as a list of strings.

    NOTE(review): the original scraped text had lost its backslashes
    (``rn`` was ``\r\n``) and the function name itself; both are restored
    here — ``main`` calls this as ``get_url``.
    """
    parsed = urlparse(url)
    host = parsed.netloc
    path = parsed.path
    if path == "":
        path = "/"

    reader, writer = await asyncio.open_connection(host, 80)
    try:
        request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host)
        writer.write(request.encode("utf8"))
        await writer.drain()  # ensure the request is actually flushed to the socket
        all_lines = []
        async for raw_line in reader:
            all_lines.append(raw_line.decode("utf8"))
        html = "\n".join(all_lines)
        html_data = html.split("\r\n\r\n")
        print(len(html_data))
        return html_data
    finally:
        # Original leaked the connection; close the writer explicitly.
        writer.close()
        await writer.wait_closed()
async def main():
    """Crawl pages 2..19 of the target site concurrently.

    The original created each task and then awaited it immediately inside the
    loop, which serialized every fetch and defeated the purpose of asyncio.
    Creating all tasks first and gathering them lets the requests overlap.
    """
    url = "xxx/?page={}"
    tasks = [asyncio.create_task(get_url(url.format(i))) for i in range(2, 20)]
    await asyncio.gather(*tasks)
# Script entry point: time the whole crawl and run the event loop in debug
# mode. The scrape had stripped the dunders — ``name``/``"main"`` must be
# ``__name__``/``"__main__"`` for the guard to ever fire.
if __name__ == "__main__":
    import time

    start_time = time.time()
    asyncio.run(main(), debug=True)
    print('last time:{}'.format(time.time() - start_time))
|
https://docs.python.org/zh-cn/3/library/asyncio.html
近期评论