如何使用Python Scrapy高效抓取特定网站的数据？

2026-06-10 23:31 | 11 阅读 | 0 评论 | SEO资源

内容介绍
文章标签
相关推荐

本文共计283个文字，预计阅读时间需要2分钟。

Python 示例代码：

import scrapy
from pc.items import FileItem
import json
import math
import datetime

class xxSpider(scrapy.Spider):
    name = 'xx'
    allowed_domains = ['xx.com']

主要源码如下：

import scrapy
from pc.items import FileItem
import json
import math
import datetime

class xxSpider(scrapy.Spider):
    """Walk the paginated listing and emit one ``FileItem`` per download.

    ``parse`` handles a listing page: it schedules a request for every
    detail link found and then follows the pagination link that comes
    after the currently selected page.  ``parse_detail`` reads the file id
    from a detail page and yields a ``FileItem`` whose ``file_urls`` holds
    the archive URL built from that id.
    """

    name = 'xx'
    allowed_domains = ['xx.com']

    # Scrapy refuses URLs without a scheme ("Missing scheme in request
    # url"), so every absolute URL built by this spider carries one.
    # NOTE(review): use https:// instead if the target site requires it.
    # NOTE(review): the hosts below are placeholders taken from the original
    # code ('xx.com' vs 'xxx.com'); confirm that allowed_domains covers the
    # host of every page that must be followed.
    site_root = 'http://www.xx.com'
    baseURL = 'http://www.xxx.com/js/piaofu.html'
    start_urls = [baseURL]

    def parse(self, response):
        """Yield requests for every detail page and for the next listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``scrapy.Request`` objects: one per detail link (handled by
            ``parse_detail``) and at most one for the next listing page
            (handled by ``parse`` again).
        """
        detail_paths = response.xpath(
            "//div[@class='list-pngjs']/dl/dd/a/@href"
        ).extract()
        for path in detail_paths:
            href = self.site_root + path
            self.logger.debug("detail page: %s", href)
            yield scrapy.Request(href, callback=self.parse_detail)

        # First link among the <li> elements that follow the current page
        # marker; None when the current page is the last one or when the
        # following <li> elements carry no link.
        next_path = response.xpath(
            "//div[@class='dede_pages']/ul/li[@class='thisclass']"
            "/following-sibling::li/a/@href"
        ).extract_first()
        if next_path:
            next_url = self.site_root + next_path
            self.logger.debug("next listing page: %s", next_url)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Build the archive URL for a detail page and yield it as an item.

        Args:
            response: the downloaded detail page.

        Yields:
            A ``FileItem`` with a single entry in ``file_urls``.  Nothing is
            yielded when the page has no download button carrying a
            ``data-fileid`` attribute.
        """
        file_id = response.xpath(
            "//div[@id='l']/div[@class='content-a']/div[@class='xiazai']"
            "/a[@class='bt-blue js-download']/@data-fileid"
        ).extract_first()
        if not file_id:
            # Skip pages without a download id instead of raising IndexError
            # and aborting the callback.
            self.logger.warning("no download id found on %s", response.url)
            return

        # If the file URL were relative, response.urljoin(url) could be used
        # to resolve it against the page URL.
        url = 'http://www.xxx.com/js/d' + file_id + '.zip'
        # Must be yielded (not returned): this callback is a generator.
        yield FileItem(file_urls=[url])

下载源码

标签：Python scrapy 下载网站数据源码