
[Python] Scraping HD wallpapers from the LOL skin site with the Scrapy framework

Author: CC下载站 | Date: 2020-03-06 | Category: Programming & Development


Code:

Spider file

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse


class LolskinSpider(scrapy.Spider):
    name = 'lolskin'
    allowed_domains = ['lolskin.cn']
    start_urls = ['https://lolskin.cn/champions.html']
    base_url = 'https://lolskin.cn'

    # Collect the link to every champion's page.
    # NOTE: the original XPath was corrupted by the page's email-protection
    # script; '//div[2]/div[1]//a/@href' is a best-guess reconstruction.
    def parse(self, response):
        urls = response.xpath('//div[2]/div[1]//a/@href').extract()
        for url in urls:
            yield scrapy.Request(url=parse.urljoin(self.base_url, url),
                                 dont_filter=True, callback=self.bizhi)

    # Collect every skin link on a champion's page.
    # NOTE: this selector is reconstructed for the same reason as above.
    def bizhi(self, response):
        skins = response.xpath('//a/@href').extract()
        for skin in skins:
            yield scrapy.Request(url=parse.urljoin(self.base_url, skin),
                                 dont_filter=True, callback=self.get_bzurl)

    # On each skin page, grab the wallpaper URL and the skin name.
    # NOTE: the image selector is likewise a reconstruction.
    def get_bzurl(self, response):
        image_urls = response.xpath('//body/div[1]//img/@src').extract()
        image_name = response.xpath('//h1/text()').extract()
        yield {
            'image_urls': image_urls,
            'image_name': image_name
        }
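
Because the XPath selectors above are reconstructions, it is worth verifying them against the live pages in scrapy shell before launching a full crawl:

scrapy shell 'https://lolskin.cn/champions.html'
>>> response.xpath('//div[2]/div[1]//a/@href').extract()[:5]  # should print champion page links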

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class PracticeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # titles = scrapy.Field()
    # yxpngs = scrapy.Field()
    urls = scrapy.Field()
    skin_name = scrapy.Field()    # skin name
    image_urls = scrapy.Field()   # wallpaper URLs for a skin
    images = scrapy.Field()
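
Note that image_urls and images are the field names Scrapy's built-in ImagesPipeline looks for by default (configurable via the IMAGES_URLS_FIELD and IMAGES_RESULT_FIELD settings), which is why they are declared here even though the spider ends up yielding plain dicts.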

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import os
import re

import scrapy
from scrapy.pipelines.images import ImagesPipeline


# class PracticePipeline(object):
#     def __init__(self):
#         self.file = open('text.csv', 'a+')
#
#     def process_item(self, item, spider):
#         # os.chdir('lolskin')
#         # for title in item['titles']:
#         #     os.makedirs(title)
#         skin_name = item['skin_name']
#         skin_jpg = item['skin_jpg']
#         for i in range(len(skin_name)):
#             self.file.write(f'{skin_name[i]},{skin_jpg}\n')
#         self.file.flush()
#         return item
#
#     def down_bizhi(self, item, spider):
#         self.file.close()


class LoLPipeline(ImagesPipeline):
    # Request each wallpaper, carrying the skin name along in meta
    # so that file_path() can use it when naming the file.
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_name': item['image_name']})

    # Rename the downloaded file to <champion>/<skin name>.jpg, where
    # <champion> comes from the /skin/<champion>/ segment of the image URL.
    def file_path(self, request, response=None, info=None):
        image_name = re.findall('/skin/(.*?)/', request.url)[0] + '/' + request.meta['image_name'][0] + '.jpg'
        return image_name
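
The renaming logic can be sanity-checked outside Scrapy; the URL and skin name below are made-up examples of the shapes the pipeline receives:

import re

url = 'https://lolskin.cn/skin/Ahri/1.jpg'  # hypothetical wallpaper URL
meta = {'image_name': ['KDA Ahri']}         # hypothetical page title from get_bzurl
print(re.findall('/skin/(.*?)/', url)[0] + '/' + meta['image_name'][0] + '.jpg')
# prints: Ahri/KDA Ahri.jpg -- ImagesPipeline saves this relative to IMAGES_STORE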

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for practice project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'practice'

SPIDER_MODULES = ['practice.spiders']
NEWSPIDER_MODULE = 'practice.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'practice (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle requests so the site isn't hammered
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'practice.middlewares.PracticeSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'practice.middlewares.PracticeDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'practice.pipelines.PracticePipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'practice.pipelines.LoLPipeline': 1
}
# Folder where downloaded wallpapers are stored
IMAGES_STORE = r'E:\Python\scrapy\practice\practice\LOLskin'  # reconstructed path; point this at your own folder
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
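
Note that ImagesPipeline depends on the Pillow library; if downloads fail with a missing-dependency error, install it first:

pip install Pillow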

main.py

from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'lolskin'])
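
This script simply lets the spider be started from an IDE; it is equivalent to running the following from the project root:

scrapy crawl lolskin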

