如何使用Python的urllib和lxml进行小说内容爬取?
- 内容介绍
- 文章标签
- 相关推荐
本文共计309个文字,预计阅读时间需要2分钟。
```python
import urllib.parse
import urllib.request
from lxml import etree
import time
```
class Novel:
    """A named novel backed by a mapping, with a text listing of its keys.

    Attributes are plain and public:
      name -- the value passed as ``name``.
      dict -- the mapping passed as ``dict`` (stored by reference, not copied).
      txt  -- every key of ``dict`` in sorted order, each followed by one space.
    """

    def __init__(self, name, dict):
        """Store ``name`` and ``dict`` and build ``txt`` from the sorted keys.

        Args:
            name: Identifier for the novel; stored unchanged.
            dict: Mapping whose keys are strings. The parameter name shadows
                the builtin but is kept so keyword callers keep working.

        Raises:
            TypeError: If a key is not a string (it cannot be concatenated
                with the separator) or the keys are not mutually sortable.
        """
        self.name = name
        self.dict = dict
        # A single join avoids repeated string reallocation; the result is the
        # same as appending ``key + ' '`` for every key in sorted order.
        self.txt = ''.join(key + ' ' for key in sorted(self.dict))
from urllib import parse from urllib import request from lxml import etree import time class Novel: def __init__(self,*args): self.name = args[0] self.dict = args[1] self.txt = ‘‘ for key in sorted(self.dict): self.txt = self.txt + self.dict[key] def write(self): f = open(self.name+‘.txt‘,‘w‘) f.write(self.txt) f.close() #获取网页源代码 def get_m.wenxuemi6.com{}‘.format(option.get(‘value‘))) return url_list def downdload_txt(url_list,**kw): if kw: start = int(kw[‘start‘]) stop = int (kw[‘stop‘]) if start >= 0 and start < len(url_list) and stop > start and stop <len(url_list): count = kw[‘start‘] count_max = kw[‘stop‘] else: count = 0 count_max = len(url_list) print(‘正在爬取目录和章节地址,请稍等……‘) d = {} while count < count_max: url = url_list[count] page = get_m.wenxuemi6.com{}‘.format(l.get(‘href‘)) #url_list.append(‘m.wenxuemi6.com{}‘.format(l.get(‘href‘))) print(‘Download chapters by URL:{}‘.format(url)) d2 = {‘{}‘.format(count): ‘‘} page = get_m.wenxuemi6.com{}‘.format(l.get(‘href‘)[:-5] + ‘_2.html‘) print(‘Download chapters by URL:{}‘.format(url)) page = get_m.wenxuemi6.com/search.php?keyword={}‘.format(parse.quote(txt_name)) referer = url url_list = get_comics_directory(url) #下载第一页目录下的小说 d = downdload_txt(url_list,start=0,stop=1) n1 = Novel(txt_name,d) #写出文件 [txt_name].txt 到当前目录下 n1.write() #下载全本小说 d2 = downdload_txt(url_list,start=0,stop=1) n2 = Novel(txt_name,d2) #写出文件 [txt_name].txt 到当前目录下 n2.write()
本文共计309个文字,预计阅读时间需要2分钟。
```python
import urllib.parse
import urllib.request
from lxml import etree
import time
```
class Novel:
    """A named novel backed by a mapping, with a text listing of its keys.

    Attributes are plain and public:
      name -- the value passed as ``name``.
      dict -- the mapping passed as ``dict`` (stored by reference, not copied).
      txt  -- every key of ``dict`` in sorted order, each followed by one space.
    """

    def __init__(self, name, dict):
        """Store ``name`` and ``dict`` and build ``txt`` from the sorted keys.

        Args:
            name: Identifier for the novel; stored unchanged.
            dict: Mapping whose keys are strings. The parameter name shadows
                the builtin but is kept so keyword callers keep working.

        Raises:
            TypeError: If a key is not a string (it cannot be concatenated
                with the separator) or the keys are not mutually sortable.
        """
        self.name = name
        self.dict = dict
        # A single join avoids repeated string reallocation; the result is the
        # same as appending ``key + ' '`` for every key in sorted order.
        self.txt = ''.join(key + ' ' for key in sorted(self.dict))
from urllib import parse from urllib import request from lxml import etree import time class Novel: def __init__(self,*args): self.name = args[0] self.dict = args[1] self.txt = ‘‘ for key in sorted(self.dict): self.txt = self.txt + self.dict[key] def write(self): f = open(self.name+‘.txt‘,‘w‘) f.write(self.txt) f.close() #获取网页源代码 def get_m.wenxuemi6.com{}‘.format(option.get(‘value‘))) return url_list def downdload_txt(url_list,**kw): if kw: start = int(kw[‘start‘]) stop = int (kw[‘stop‘]) if start >= 0 and start < len(url_list) and stop > start and stop <len(url_list): count = kw[‘start‘] count_max = kw[‘stop‘] else: count = 0 count_max = len(url_list) print(‘正在爬取目录和章节地址,请稍等……‘) d = {} while count < count_max: url = url_list[count] page = get_m.wenxuemi6.com{}‘.format(l.get(‘href‘)) #url_list.append(‘m.wenxuemi6.com{}‘.format(l.get(‘href‘))) print(‘Download chapters by URL:{}‘.format(url)) d2 = {‘{}‘.format(count): ‘‘} page = get_m.wenxuemi6.com{}‘.format(l.get(‘href‘)[:-5] + ‘_2.html‘) print(‘Download chapters by URL:{}‘.format(url)) page = get_m.wenxuemi6.com/search.php?keyword={}‘.format(parse.quote(txt_name)) referer = url url_list = get_comics_directory(url) #下载第一页目录下的小说 d = downdload_txt(url_list,start=0,stop=1) n1 = Novel(txt_name,d) #写出文件 [txt_name].txt 到当前目录下 n1.write() #下载全本小说 d2 = downdload_txt(url_list,start=0,stop=1) n2 = Novel(txt_name,d2) #写出文件 [txt_name].txt 到当前目录下 n2.write()

