当前位置:网站首页 > 更多 > 编程开发 > 正文

[Python] 贝壳租房爬虫

作者:CC下载站 日期:2021-11-25 00:00:00 浏览:79 分类:编程开发

import urllib.request as request

from bs4 import BeautifulSoup
import os


def downloadImg(url: str, name):
    """Download every slide image found on a listing detail page.

    The page at ``url`` is fetched and parsed, and each ``<img>`` matched by
    ``div.content__article__slide__item > img`` is saved under
    ``./result/images/<name>/`` (any ``/`` in ``name`` is replaced by ``-``).

    Args:
        url: Address of the detail page to scan for images.
        name: Label used as the per-listing directory name.

    Raises:
        urllib.error.URLError: If the page or one of its images cannot be
            fetched.
        OSError: If the target directory or an image file cannot be written.
    """
    # Close the HTTP response deterministically instead of leaking the socket.
    with request.urlopen(url) as response:
        html = str(response.read(), encoding='utf-8')
    soup = BeautifulSoup(html, features='html.parser')

    savePath = f"./result/images/{name.replace('/', '-')}"
    # makedirs also creates missing parents (./result/images) and, with
    # exist_ok, avoids the check-then-create race of exists() + mkdir().
    os.makedirs(savePath, exist_ok=True)

    for img in soup.select('div.content__article__slide__item > img'):
        imgUrl = img.get('data-src')
        if not imgUrl:
            # Tag carries no data-src attribute; nothing to download.
            continue
        # Last path segment, minus any "!..." suffix and "?..." query string.
        fileName = imgUrl.split('/')[-1].split('!')[0].split('?')[0]
        if not fileName:
            # A URL ending in "/" would otherwise resolve to the directory.
            continue
        # Fetch before opening the output file so a failed download does not
        # leave an empty file on disk.
        with request.urlopen(imgUrl) as imgResponse:
            payload = imgResponse.read()
        with open(f"{savePath}/{fileName}", 'wb') as f:
            f.write(payload)


def a():
    """Crawl listing pages 1-10 and record each listing in ./result/list.txt.

    For every ``div.content__list--item`` card on a page this prints and
    writes one ``title,description,price,tags`` line, and calls
    ``downloadImg`` on the card's detail page to save its images.

    Cards lacking an image tag, a title (``alt``), or a detail link are
    skipped instead of aborting the whole crawl. A missing description or
    price is written as an empty string, and missing tags as an empty list.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched.
        OSError: If the output file cannot be created or written.
    """
    # The output file lives in ./result; make sure it exists before opening.
    os.makedirs('./result', exist_ok=True)
    with open('./result/list.txt', 'w', encoding='utf-8') as f:
        for i in range(1, 11):
            url = f'https://cs.zu.ke.com/zufang/pg{i}/#contentList'
            # Close the HTTP response deterministically.
            with request.urlopen(url) as response:
                data = str(response.read(), encoding='utf-8')
            soup = BeautifulSoup(data, features='html.parser')
            for item in soup.find_all('div', attrs={'class': 'content__list--item'}):
                imgTag = item.find('img')
                linkTag = item.find('a', attrs={'class': 'content__list--item--aside'})
                if imgTag is None or linkTag is None:
                    # Card does not have the expected structure; skip it.
                    continue
                title = imgTag.get('alt')
                href = linkTag.get('href')
                if not title or not href:
                    # Without a title there is no directory name, and without
                    # an href there is no detail page to visit.
                    continue

                imgUrl = imgTag.get('data-src')
                print(imgUrl)
                print(title)
                downloadImg(f"https://cs.zu.ke.com{href}", title)

                desTag = item.find('p', attrs={'class': 'content__list--item--des'})
                des = '' if desTag is None else \
                    desTag.get_text().replace('\n', '').replace(' ', '')
                print(des)

                priceTag = item.find('span', attrs={'class': "content__list--item-price"})
                price = '' if priceTag is None else priceTag.get_text()
                print(price)

                bottomTag = item.find('p', attrs={'class': 'content__list--item--bottom oneline'})
                tags = [] if bottomTag is None else \
                    [tag.get_text() for tag in bottomTag.select('p > i')]
                print(tags)

                f.write(f'{title},{des},{price},{tags}\n')
                print('*' * 30)


if __name__ == '__main__':
    # Every other path in this script is relative to the working directory
    # (./result/list.txt, ./result/images/...), so create the output tree
    # there rather than at the filesystem root. exist_ok lets the script be
    # re-run without failing on an existing directory.
    os.makedirs('./result/images', exist_ok=True)
    a()

您需要 登录账户 后才能发表评论

取消回复欢迎 发表评论:

关灯