如何用Python编写爬虫程序,专门抓取中证网上的银行数据?
- 内容介绍
- 文章标签
- 相关推荐
本文共计1651个文字,预计阅读时间需要7分钟。
python最终版:07_中证网(Plus+ - Pro).py
最终版:07_中证网(Plus -Pro).py
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os
# Crawler for the cs.com.cn site search: for every configured year it walks
# the search-result pages for QUERY and stores each result summary as its own
# UTF-8 text file under OUTPUT_ROOT/<query>/<year>/.

QUERY = '苏州银行'  # search keyword
# (year, hard-coded total number of result pages for that year)
YEAR_PAGES = (
    (2014, 2), (2015, 1), (2016, 1), (2017, 1),
    (2018, 11), (2019, 1), (2020, 19), (2021, 7),
)
OUTPUT_ROOT = 'D:/桌面/爬虫-银行/中国证券网'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# Inline style that identifies the <td> holding a result summary.
SUMMARY_STYLE = 'font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever


def build_search_url(query, year, page):
    """Return the search URL for *query*, restricted to *year*, result page *page*.

    The published listing had lost the URL scheme (requests rejects a URL
    without one) and showed ``&timescope`` as ``×cope`` (HTML entity
    ``&times`` decoded by mistake); both are restored here.
    NOTE(review): ``http://`` is assumed for the scheme -- confirm.
    """
    return (
        f'http://search.cs.com.cn/search?page={page}&channelid=215308'
        f'&searchword={query}&keyword={query}&token=12.1462412070719.47'
        '&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude='
        '&searchscope=&timescope=&timescopecolumn=&orderby='
        f'&timeline=={year}'
    )


def article_path(year_dir, year, number):
    """Return the output file path for the *number*-th summary of *year*."""
    return f'{year_dir}/({year}){number}.txt'


def fetch_summaries(url):
    """Download one result page and return the text of every summary cell.

    Each <table> contributes at most one entry: the first <td> inside it whose
    ``style`` attribute equals SUMMARY_STYLE.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")  # explicit HTML parser
    summaries = []
    for table in soup.find_all("table"):
        cell = table.find('td', style=SUMMARY_STYLE)
        if cell is not None:
            summaries.append(cell.get_text())
    return summaries


def crawl_year(query, year, pages):
    """Save all summaries of *pages* result pages for *query* in *year*.

    Files are numbered consecutively across pages, starting at 1.
    """
    year_dir = f'{OUTPUT_ROOT}/{query}/{year}'
    # makedirs also creates missing parent folders; plain mkdir would fail.
    os.makedirs(year_dir, exist_ok=True)
    saved = 0  # number of files already written for this year
    for page in range(1, pages + 1):
        summaries = fetch_summaries(build_search_url(query, year, page))
        for _ in range(3):
            print(f'\n>>>--------------------第{page}页---------------------<<<\n')
        for idx, text in enumerate(summaries, start=1):
            # `with` closes the file even if the write fails.
            with open(article_path(year_dir, year, saved + idx), 'w', encoding='utf-8') as fp:
                fp.write(text + '\n')  # text only
            print(text)
            print(f'\n> > >{year}年,第{page}页,第{idx}篇,成功! < < <')
        # No extra +1 here: that skipped one file number after every page.
        saved += len(summaries)
    print('----------------------------')
    print(f'------\n{year}年,爬取完毕----')
    print('----------------------------')


def main():
    """Run the crawl for every configured year."""
    # Change the default encoding of standard output.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
    for year, pages in YEAR_PAGES:
        crawl_year(QUERY, year, pages)


if __name__ == "__main__":
    main()
历史优化记录:01_中证网.py
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# NOTE(review): first archived revision of the crawler, kept for history only.
# Indentation was lost in this listing, and the final line below is not a
# complete statement: it has no trailing colon, and its right-hand string looks
# like the tail of a later search URL spliced onto a 5-character prefix check.
# The fragment cannot run as shown. Presumably "http://" was also stripped from
# the URL literal -- confirm against the original script.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030') # change the default encoding of standard output
# Ask the user for the search term and the number of pages to crawl.
query = input("请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
# Stop when fewer than one page was requested.
if pages < 1:
exit()
# Search URL for the entered term (no page parameter in this revision).
url = f'search.cs.com.cn/search?channelid=215308&perpage=&templet=&token=12.1462412070719.47&searchword={query}'
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
"Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Every <a> inside the first <table> of the response.
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; the loop body is missing.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
# NOTE(review): run of later archived revisions fused together. Each revision
# is cut short at an `if a.get('href')[...] ==` line whose right-hand side is
# the tail of a search URL (mismatched quotes, no colon), so `url`, `p`,
# `pages` and `year` are never visibly assigned here and indentation is lost.
# Not runnable as shown; documented as-is. The `×cope` text inside the URLs
# presumably was `&timescope` before an HTML-entity decode -- confirm.
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Every <a> inside the first <table> of the response.
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here.
if a.get('href')[:5] == "search.cs.com.cn/search?page=1&channelid=215308&searchword={query}'
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
"Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this URL tail pins the year to 2021.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline==2021'
# From this revision on a different User-Agent string is used.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# The page banner is printed three times.
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all('a')
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this URL tail pins the year to 2020.
if a.get('href')[:5] == "search.cs.com.cn/search?page={pages}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline==2020'
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all('a')
# Debug output of the collected links.
print('alist:',alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this revision slices href with [4:].
if a.get('href')[4:] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline=={year}'
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Links are now taken only from the first <tr> of the first <table>.
alist = page.find("table").find('tr').find_all('a')
# print('alist:', alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this revision slices href with [:4].
if a.get('href')[:4] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline=={year}'
# NOTE(review): last archived fragment. It is the per-page body of the crawler
# (request, parse, save). The enclosing loops and the assignments of `url`,
# `p`, `year`, `m` and `query` are not visible here, indentation is lost, and
# `os` is used although the import lines of this snippet do not import it.
# Not runnable as shown; documented as-is.
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# The page banner is printed three times.
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find_all("table")
datalist = []
# For every <table>, take the first <td> with this exact inline style, if any,
# and collect its text.
for ii in alist:
ss=ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
# print('ss=\n\n',ss)
if ss != None:
ss = ss.get_text()
datalist.append(ss)
# print('data:',datalist,len(datalist))
if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'): # if this folder does not exist
os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}') # create it
# Write each collected text to its own file, numbered from the offset m.
for ii in range(len(datalist)):
fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
fp.write(datalist[ii] + '\n') # text only
print(datalist[ii])
print(f'\n> > >第{p}页,第{ii + 1}篇,成功! < < <')
fp.close()
# Advance the file-number offset; the extra +1 leaves one unused number
# between the files of consecutive pages.
m = m + len(datalist) + 1
print('----------------------------')
print(f'------\n{year}年,爬取完毕----')
print('----------------------------')
转载请注明出处,谢谢!!!
本文共计1651个文字,预计阅读时间需要7分钟。
python最终版:07_中证网(Plus+ - Pro).py
最终版:07_中证网(Plus -Pro).py
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os
# Crawler for the cs.com.cn site search: for every configured year it walks
# the search-result pages for QUERY and stores each result summary as its own
# UTF-8 text file under OUTPUT_ROOT/<query>/<year>/.

QUERY = '苏州银行'  # search keyword
# (year, hard-coded total number of result pages for that year)
YEAR_PAGES = (
    (2014, 2), (2015, 1), (2016, 1), (2017, 1),
    (2018, 11), (2019, 1), (2020, 19), (2021, 7),
)
OUTPUT_ROOT = 'D:/桌面/爬虫-银行/中国证券网'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
# Inline style that identifies the <td> holding a result summary.
SUMMARY_STYLE = 'font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever


def build_search_url(query, year, page):
    """Return the search URL for *query*, restricted to *year*, result page *page*.

    The published listing had lost the URL scheme (requests rejects a URL
    without one) and showed ``&timescope`` as ``×cope`` (HTML entity
    ``&times`` decoded by mistake); both are restored here.
    NOTE(review): ``http://`` is assumed for the scheme -- confirm.
    """
    return (
        f'http://search.cs.com.cn/search?page={page}&channelid=215308'
        f'&searchword={query}&keyword={query}&token=12.1462412070719.47'
        '&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude='
        '&searchscope=&timescope=&timescopecolumn=&orderby='
        f'&timeline=={year}'
    )


def article_path(year_dir, year, number):
    """Return the output file path for the *number*-th summary of *year*."""
    return f'{year_dir}/({year}){number}.txt'


def fetch_summaries(url):
    """Download one result page and return the text of every summary cell.

    Each <table> contributes at most one entry: the first <td> inside it whose
    ``style`` attribute equals SUMMARY_STYLE.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")  # explicit HTML parser
    summaries = []
    for table in soup.find_all("table"):
        cell = table.find('td', style=SUMMARY_STYLE)
        if cell is not None:
            summaries.append(cell.get_text())
    return summaries


def crawl_year(query, year, pages):
    """Save all summaries of *pages* result pages for *query* in *year*.

    Files are numbered consecutively across pages, starting at 1.
    """
    year_dir = f'{OUTPUT_ROOT}/{query}/{year}'
    # makedirs also creates missing parent folders; plain mkdir would fail.
    os.makedirs(year_dir, exist_ok=True)
    saved = 0  # number of files already written for this year
    for page in range(1, pages + 1):
        summaries = fetch_summaries(build_search_url(query, year, page))
        for _ in range(3):
            print(f'\n>>>--------------------第{page}页---------------------<<<\n')
        for idx, text in enumerate(summaries, start=1):
            # `with` closes the file even if the write fails.
            with open(article_path(year_dir, year, saved + idx), 'w', encoding='utf-8') as fp:
                fp.write(text + '\n')  # text only
            print(text)
            print(f'\n> > >{year}年,第{page}页,第{idx}篇,成功! < < <')
        # No extra +1 here: that skipped one file number after every page.
        saved += len(summaries)
    print('----------------------------')
    print(f'------\n{year}年,爬取完毕----')
    print('----------------------------')


def main():
    """Run the crawl for every configured year."""
    # Change the default encoding of standard output.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
    for year, pages in YEAR_PAGES:
        crawl_year(QUERY, year, pages)


if __name__ == "__main__":
    main()
历史优化记录:01_中证网.py
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
# NOTE(review): first archived revision of the crawler, kept for history only.
# Indentation was lost in this listing, and the final line below is not a
# complete statement: it has no trailing colon, and its right-hand string looks
# like the tail of a later search URL spliced onto a 5-character prefix check.
# The fragment cannot run as shown. Presumably "http://" was also stripped from
# the URL literal -- confirm against the original script.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030') # change the default encoding of standard output
# Ask the user for the search term and the number of pages to crawl.
query = input("请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
# Stop when fewer than one page was requested.
if pages < 1:
exit()
# Search URL for the entered term (no page parameter in this revision).
url = f'search.cs.com.cn/search?channelid=215308&perpage=&templet=&token=12.1462412070719.47&searchword={query}'
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
"Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Every <a> inside the first <table> of the response.
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; the loop body is missing.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
# NOTE(review): run of later archived revisions fused together. Each revision
# is cut short at an `if a.get('href')[...] ==` line whose right-hand side is
# the tail of a search URL (mismatched quotes, no colon), so `url`, `p`,
# `pages` and `year` are never visibly assigned here and indentation is lost.
# Not runnable as shown; documented as-is. The `×cope` text inside the URLs
# presumably was `&timescope` before an HTML-entity decode -- confirm.
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Every <a> inside the first <table> of the response.
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here.
if a.get('href')[:5] == "search.cs.com.cn/search?page=1&channelid=215308&searchword={query}'
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
"Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all("a")
# print(alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this URL tail pins the year to 2021.
if a.get('href')[:5] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline==2021'
# From this revision on a different User-Agent string is used.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# The page banner is printed three times.
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all('a')
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this URL tail pins the year to 2020.
if a.get('href')[:5] == "search.cs.com.cn/search?page={pages}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline==2020'
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find("table").find_all('a')
# Debug output of the collected links.
print('alist:',alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this revision slices href with [4:].
if a.get('href')[4:] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline=={year}'
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
# Links are now taken only from the first <tr> of the first <table>.
alist = page.find("table").find('tr').find_all('a')
# print('alist:', alist)
weblist = []
for a in alist:
# NOTE(review): line is truncated/spliced here; this revision slices href with [:4].
if a.get('href')[:4] == "search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope=×cope=×copecolumn=&orderby=&timeline=={year}'
# NOTE(review): last archived fragment. It is the per-page body of the crawler
# (request, parse, save). The enclosing loops and the assignments of `url`,
# `p`, `year`, `m` and `query` are not visible here, indentation is lost, and
# `os` is used although the import lines of this snippet do not import it.
# Not runnable as shown; documented as-is.
# User-Agent header sent with the request.
dic = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
resp = requests.get(url, headers=dic, )
resp.encoding = 'utf-8'
# print(resp)
# The page banner is printed three times.
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
print(f'\n>>>--------------------第{p}页---------------------<<<\n')
# print(resp.text)
page = BeautifulSoup(resp.text, "html.parser") # specify the HTML parser
alist = page.find_all("table")
datalist = []
# For every <table>, take the first <td> with this exact inline style, if any,
# and collect its text.
for ii in alist:
ss=ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
# print('ss=\n\n',ss)
if ss != None:
ss = ss.get_text()
datalist.append(ss)
# print('data:',datalist,len(datalist))
if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'): # if this folder does not exist
os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}') # create it
# Write each collected text to its own file, numbered from the offset m.
for ii in range(len(datalist)):
fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
fp.write(datalist[ii] + '\n') # text only
print(datalist[ii])
print(f'\n> > >第{p}页,第{ii + 1}篇,成功! < < <')
fp.close()
# Advance the file-number offset; the extra +1 leaves one unused number
# between the files of consecutive pages.
m = m + len(datalist) + 1
print('----------------------------')
print(f'------\n{year}年,爬取完毕----')
print('----------------------------')
转载请注明出处,谢谢!!!

