如何用Python编写针对runoob网站的长尾关键词爬虫?

2026-04-06 19:57 · 1阅读 · 0评论 · SEO教程
  • 内容介绍
  • 文章标签
  • 相关推荐

本文共计209个文字,预计阅读时间需要1分钟。

如何用Python编写针对runoob网站的长尾关键词爬虫?

import cheerio from 'cheerio';
import request from 'sync-request';
import fs from 'fs';
import process from 'process';

// The page to crawl is the first command-line argument; exit with a
// failure status when it is missing.
const url = process.argv[2];
if (!url) {
    console.log('请指定页面。');
    process.exit(1);
}

// Open output.txt for writing ('w' creates the file or truncates it).
const ofile = fs.openSync('output.txt', 'w');

runoob.js

// runoob.js: fetch the page given on the command line, collect every link
// found in its #leftcolumn element, and append the #content HTML of each
// linked page to out.html.
// Usage: node runoob.js <page-url>
const cheerio = require('cheerio');
const request = require('sync-request');
const fs = require('fs');
const process = require('process');

const url = process.argv[2];
if (!url) {
    console.log('请指定页面。');
    // A missing argument is a usage error, so report failure to the shell.
    process.exit(1);
}

const ofile = fs.openSync('out.html', 'w');
try {
    const indexHtml = request('GET', url).getBody().toString();
    // Links are resolved against the start page so relative hrefs work.
    const toc = getToc(indexHtml, url);
    for (const pageUrl of toc) {
        // One failing page must not abort the whole crawl: log and move on.
        try {
            console.log('page: ' + pageUrl);
            const html = request('GET', pageUrl).getBody().toString();
            const content = getContent(html);
            if (content == null) {
                console.log('no #content found: ' + pageUrl);
                continue;
            }
            fs.writeSync(ofile, content, null, 'utf-8');
            fs.writeSync(ofile, '\n \n', null, 'utf-8');
        } catch (ex) {
            console.log(ex);
        }
    }
} finally {
    // Close the descriptor even when fetching the index page throws.
    fs.closeSync(ofile);
}
console.log('Done..');

/**
 * Extract the absolute URLs of all links inside the #leftcolumn element.
 *
 * @param {string} html - HTML source of the index page.
 * @param {string} [baseUrl] - URL the hrefs are resolved against; defaults
 *     to 'https://www.runoob.com/' when omitted.
 * @returns {string[]} Absolute http(s) URLs, in document order.
 */
function getToc(html, baseUrl) {
    const base = baseUrl || 'https://www.runoob.com/';
    const $ = cheerio.load(html);
    const $list = $('#leftcolumn').find('a');
    const res = [];
    for (let i = 0; i < $list.length; i++) {
        const href = $list.eq(i).attr('href');
        if (!href) {
            // Anchor without an href: nothing to fetch.
            continue;
        }
        let resolved;
        try {
            resolved = new URL(href, base);
        } catch (ex) {
            // Unparseable href (or base): skip this link.
            continue;
        }
        // Skip javascript:, mailto: and other links that cannot be requested.
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
            continue;
        }
        res.push(resolved.href);
    }
    return res;
}

/**
 * Return the inner HTML of the #content element after removing every
 * element with class "tryitbtn".
 *
 * @param {string} html - HTML source of a page.
 * @returns {string|null} Result of cheerio's .html() on the #content selection.
 */
function getContent(html) {
    const $ = cheerio.load(html);
    $('.tryitbtn').remove();
    return $('#content').html();
}

如何用Python编写针对runoob网站的长尾关键词爬虫?

本文共计209个文字,预计阅读时间需要1分钟。

如何用Python编写针对runoob网站的长尾关键词爬虫?

import cheerio from 'cheerio';
import request from 'sync-request';
import fs from 'fs';
import process from 'process';

// The page to crawl is the first command-line argument; exit with a
// failure status when it is missing.
const url = process.argv[2];
if (!url) {
    console.log('请指定页面。');
    process.exit(1);
}

// Open output.txt for writing ('w' creates the file or truncates it).
const ofile = fs.openSync('output.txt', 'w');

runoob.js

// runoob.js: fetch the page given on the command line, collect every link
// found in its #leftcolumn element, and append the #content HTML of each
// linked page to out.html.
// Usage: node runoob.js <page-url>
const cheerio = require('cheerio');
const request = require('sync-request');
const fs = require('fs');
const process = require('process');

const url = process.argv[2];
if (!url) {
    console.log('请指定页面。');
    // A missing argument is a usage error, so report failure to the shell.
    process.exit(1);
}

const ofile = fs.openSync('out.html', 'w');
try {
    const indexHtml = request('GET', url).getBody().toString();
    // Links are resolved against the start page so relative hrefs work.
    const toc = getToc(indexHtml, url);
    for (const pageUrl of toc) {
        // One failing page must not abort the whole crawl: log and move on.
        try {
            console.log('page: ' + pageUrl);
            const html = request('GET', pageUrl).getBody().toString();
            const content = getContent(html);
            if (content == null) {
                console.log('no #content found: ' + pageUrl);
                continue;
            }
            fs.writeSync(ofile, content, null, 'utf-8');
            fs.writeSync(ofile, '\n \n', null, 'utf-8');
        } catch (ex) {
            console.log(ex);
        }
    }
} finally {
    // Close the descriptor even when fetching the index page throws.
    fs.closeSync(ofile);
}
console.log('Done..');

/**
 * Extract the absolute URLs of all links inside the #leftcolumn element.
 *
 * @param {string} html - HTML source of the index page.
 * @param {string} [baseUrl] - URL the hrefs are resolved against; defaults
 *     to 'https://www.runoob.com/' when omitted.
 * @returns {string[]} Absolute http(s) URLs, in document order.
 */
function getToc(html, baseUrl) {
    const base = baseUrl || 'https://www.runoob.com/';
    const $ = cheerio.load(html);
    const $list = $('#leftcolumn').find('a');
    const res = [];
    for (let i = 0; i < $list.length; i++) {
        const href = $list.eq(i).attr('href');
        if (!href) {
            // Anchor without an href: nothing to fetch.
            continue;
        }
        let resolved;
        try {
            resolved = new URL(href, base);
        } catch (ex) {
            // Unparseable href (or base): skip this link.
            continue;
        }
        // Skip javascript:, mailto: and other links that cannot be requested.
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
            continue;
        }
        res.push(resolved.href);
    }
    return res;
}

/**
 * Return the inner HTML of the #content element after removing every
 * element with class "tryitbtn".
 *
 * @param {string} html - HTML source of a page.
 * @returns {string|null} Result of cheerio's .html() on the #content selection.
 */
function getContent(html) {
    const $ = cheerio.load(html);
    $('.tryitbtn').remove();
    return $('#content').html();
}

如何用Python编写针对runoob网站的长尾关键词爬虫?