如何编写一个针对cppreference.com的长尾词爬虫?

2026-04-06 19:51 · 1阅读 · 0评论 · SEO资讯
  • 内容介绍
  • 文章标签
  • 相关推荐

本文共计436个文字,预计阅读时间需要2分钟。

如何编写一个针对cppreference.com的长尾词爬虫?

JavaScript 实现:使用 cheerio 和 sync-request 库,通过 fs 模块读取文件,实现数组去重功能。

如何编写一个针对cppreference.com的长尾词爬虫?

cpp_toc.js

var cheerio = require('cheerio') var request = require('sync-request') var fs = require('fs') function arrayUnique(arr) { var res = []; var s = new Set(); for(var e of arr) { if(!s.has(e)) { res.push(e) s.add(e) } } return res; } function getNext(url) { var html = request('GET', url).getBody().toString(); var $ = cheerio.load(html) var $links = $('a') var links = [] for(var i = 0; i < $links.length; i++) links.push($links.eq(i).attr('href')) links = links.filter(s => s) .map(s => s.startsWith('/')? 'zh.cppreference.com' + s: s) .filter(s => s.startsWith(url)) .filter(s => s != url) .filter(s => /^\/[^\/]+$/.test(s.replace(url, ''))) .filter(s => s.indexOf('#') == -1) .filter(s => s.indexOf('/experimental') == -1); console.log(arrayUnique(links)) return arrayUnique(links); } //var url = 'zh.cppreference.com/w/cpp'; var url = 'zh.cppreference.com/w/c'; var ofs = fs.openSync('out.txt', 'w') var stk = [url] while(stk.length != 0) { url = stk.pop(); fs.writeSync(ofs, url + '\n') console.log(url) for(var nxt of getNext(url).reverse()) { stk.push(nxt) } } console.log('done') cpp.js

var cheerio = require('cheerio'); var request = require('sync-request'); var fs = require('fs'); var process = require('process'); var ofile = fs.openSync('out.html', 'a'); var start; if (fs.existsSync('out.idx')) { start = fs.readFileSync('out.idx') start = Number.parseInt(start) + 1 } else start = 0; var toc = getToc(); for (var i = start; i < toc.length; i++) { try { var url = toc[i] console.log('page: ' + url); html = request('GET', url).getBody().toString(); var content = getContent(html); fs.writeSync(ofile, content, null, 'utf-8'); fs.writeSync(ofile, '\n', null, 'utf-8'); var hisLink = getHisLink(html); html = request('GET', hisLink).getBody().toString(); content = getHisContent(html); fs.writeSync(ofile, content, null, 'utf-8'); fs.writeSync(ofile, '\n \n', null, 'utf-8'); fs.writeFileSync('out.idx', i.toString()) } catch (ex) { console.log(ex); i--; } } fs.closeSync(ofile); console.log('Done..'); function getToc() { return fs.readFileSync('assets/cpp_toc.txt', 'utf-8') .split(/\n/g).filter(s => s); } function getContent(html) { var $ = cheerio.load(html); $('.t-navbar, .editsection').remove(); var title = $('#firstHeading').toString(); var content = $('#mw-content-text').html(); return title + content; } function getHisLink(html) { var $ = cheerio.load(html); return 'zh.cppreference.com' + $('#ca-history a').attr('href') } function getHisContent(html) { var $ = cheerio.load(html); $('input, .editsection, .external').remove() $('.mw-history-undo').replaceWith('撤销'); return '

版本历史

\n' + $('#pagehistory').html(); }

本文共计436个文字,预计阅读时间需要2分钟。

如何编写一个针对cppreference.com的长尾词爬虫?

JavaScript 实现:使用 cheerio 和 sync-request 库,通过 fs 模块读取文件,实现数组去重功能。

如何编写一个针对cppreference.com的长尾词爬虫?

cpp_toc.js

var cheerio = require('cheerio') var request = require('sync-request') var fs = require('fs') function arrayUnique(arr) { var res = []; var s = new Set(); for(var e of arr) { if(!s.has(e)) { res.push(e) s.add(e) } } return res; } function getNext(url) { var html = request('GET', url).getBody().toString(); var $ = cheerio.load(html) var $links = $('a') var links = [] for(var i = 0; i < $links.length; i++) links.push($links.eq(i).attr('href')) links = links.filter(s => s) .map(s => s.startsWith('/')? 'zh.cppreference.com' + s: s) .filter(s => s.startsWith(url)) .filter(s => s != url) .filter(s => /^\/[^\/]+$/.test(s.replace(url, ''))) .filter(s => s.indexOf('#') == -1) .filter(s => s.indexOf('/experimental') == -1); console.log(arrayUnique(links)) return arrayUnique(links); } //var url = 'zh.cppreference.com/w/cpp'; var url = 'zh.cppreference.com/w/c'; var ofs = fs.openSync('out.txt', 'w') var stk = [url] while(stk.length != 0) { url = stk.pop(); fs.writeSync(ofs, url + '\n') console.log(url) for(var nxt of getNext(url).reverse()) { stk.push(nxt) } } console.log('done') cpp.js

var cheerio = require('cheerio'); var request = require('sync-request'); var fs = require('fs'); var process = require('process'); var ofile = fs.openSync('out.html', 'a'); var start; if (fs.existsSync('out.idx')) { start = fs.readFileSync('out.idx') start = Number.parseInt(start) + 1 } else start = 0; var toc = getToc(); for (var i = start; i < toc.length; i++) { try { var url = toc[i] console.log('page: ' + url); html = request('GET', url).getBody().toString(); var content = getContent(html); fs.writeSync(ofile, content, null, 'utf-8'); fs.writeSync(ofile, '\n', null, 'utf-8'); var hisLink = getHisLink(html); html = request('GET', hisLink).getBody().toString(); content = getHisContent(html); fs.writeSync(ofile, content, null, 'utf-8'); fs.writeSync(ofile, '\n \n', null, 'utf-8'); fs.writeFileSync('out.idx', i.toString()) } catch (ex) { console.log(ex); i--; } } fs.closeSync(ofile); console.log('Done..'); function getToc() { return fs.readFileSync('assets/cpp_toc.txt', 'utf-8') .split(/\n/g).filter(s => s); } function getContent(html) { var $ = cheerio.load(html); $('.t-navbar, .editsection').remove(); var title = $('#firstHeading').toString(); var content = $('#mw-content-text').html(); return title + content; } function getHisLink(html) { var $ = cheerio.load(html); return 'zh.cppreference.com' + $('#ca-history a').attr('href') } function getHisContent(html) { var $ = cheerio.load(html); $('input, .editsection, .external').remove() $('.mw-history-undo').replaceWith('撤销'); return '

版本历史

\n' + $('#pagehistory').html(); }