使用 Node.js 写爬虫,抓取搜狗词库

使用到的库

  1. cheerio (俗称 Node 版的 jQuery)
  2. axios (网络请求库)

获取所有词库列表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import * as cheerio from 'cheerio';
import axios from 'axios';

// Origin of the Sogou pinyin site; request paths are appended to this to form the full URL.
const host = 'https://pinyin.sogou.com';

/**
 * Crawls the Sogou lexicon category index and collects every lexicon listed
 * under each category, following the "下一页" (next page) link until a
 * category has no further pages.
 *
 * @returns One entry per lexicon found: `cid` is the last path segment of the
 *   lexicon's title link, `name` is the title text, `cateId` is taken from the
 *   category URL, and `version` is the text of the last detail field in the
 *   lexicon's detail block.
 */
export async function getLexicons() {
  console.info('开始爬取搜狗词库列表...');
  const startTime = Date.now();
  const result: Array<{ cid: string; name: string; cateId: string; version: string }> = [];

  let $ = await getPage('/dict/cate/index');

  // Collect category links, ignoring anchors that carry no href.
  const cateUrls: string[] = [];
  $('.nav_list a').each((idx, ele) => {
    const href = $(ele).attr('href');
    if (href) {
      cateUrls.push(href);
    }
  });

  for (const cateUrl of cateUrls) {
    // Category id is the path segment after "/cate/index/", without the query string.
    const cateId = (cateUrl.split('/cate/index/')[1] || '').split('?')[0];
    if (!cateId) {
      console.warn(`跳过无法识别的分类链接: ${cateUrl}`);
      continue;
    }

    let pageUrl: string | null = cateUrl;
    while (pageUrl) {
      $ = await getPage(pageUrl);

      $('.dict_detail_block').each((idx, ele) => {
        const titleEle = $(ele)
          .children('.dict_detail_title_block')
          .first()
          .children()
          .first()
          .children()
          .first();
        const href = $(titleEle).attr('href');
        if (!href) {
          // Without a link there is no id to record for this entry.
          return;
        }
        const hrefSplit = href.split('/');
        const updateTime = $(ele)
          .children('.dict_detail_show')
          .first()
          .children()
          .first()
          .children()
          .last()
          .text();
        result.push({
          cid: hrefSplit[hrefSplit.length - 1],
          name: $(titleEle).text(),
          cateId,
          version: updateTime,
        });
      });

      // Start from "no next page" so that a page without any pager links
      // terminates the loop instead of re-fetching the same URL forever.
      let nextUrl: string | null = null;
      $('#dict_page_list ul li span a').each((idx, ele) => {
        if ($(ele).text() === '下一页') {
          nextUrl = $(ele).attr('href') || null;
        }
      });
      pageUrl = nextUrl;
    }
  }

  console.info(`爬取搜狗词库列表完成,共${result.length}条,耗时 ${Date.now() - startTime}ms`);
  return result;
}

/**
 * Fetches `host + url` and returns the response body loaded into cheerio.
 * A failed request is logged and retried.
 *
 * @param url Path (and query) appended to `host`.
 * @param maxRetries Maximum number of retries after the first attempt.
 *   Defaults to `Infinity`, i.e. retry until the request succeeds.
 * @returns The cheerio root for the fetched document.
 * @throws The last request error once `maxRetries` retries have been used up.
 */
async function getPage(url: string, maxRetries: number = Infinity): Promise<CheerioStatic> {
  // Iterative retry: avoids building an ever-deeper chain of pending promises.
  for (let attempt = 0; ; attempt++) {
    try {
      const res = await axios.get(host + url);
      return cheerio.load(res.data);
    } catch (error) {
      console.warn(error instanceof Error ? error.message : String(error));
      if (attempt >= maxRetries) {
        throw error;
      }
      console.info('重试中...');
    }
  }
}

下载并解析词库文件

使用SCEL包解析搜狗词库

1
git@github.com:coldcafe/SCEL.git

获取词库中的所有词

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/**
 * Downloads a lexicon file from the Sogou download endpoint and parses it
 * with SCEL.
 *
 * @param id Lexicon id, sent as the `id` query parameter.
 * @param name Lexicon name, sent as the `name` query parameter.
 * @returns The `words` property of the object returned by `SCEL.parseBuffer`.
 * @throws Rejects when the request or response stream errors, or when parsing
 *   the downloaded buffer throws.
 */
async getWords(id: string, name: string): Promise<any[]> {
  console.time('加载词库');
  // Encode each parameter on its own so characters such as "&" or "#" in the
  // name cannot break the query string.
  const url =
    'http://download.pinyin.sogou.com/dict/download_cell.php' +
    `?id=${encodeURIComponent(id)}&name=${encodeURIComponent(name)}`;
  return new Promise((resolve, reject) => {
    http
      .get(url, res => {
        const chunks: Buffer[] = [];
        res.on('data', (chunk: Buffer) => chunks.push(chunk));
        res.on('error', reject);
        res.on('end', () => {
          // Parsing happens inside an event callback, so it needs its own
          // try/catch to turn a parse failure into a rejection.
          try {
            const dict = SCEL.parseBuffer(Buffer.concat(chunks));
            console.timeEnd('加载词库');
            resolve(dict.words);
          } catch (error) {
            reject(error);
          }
        });
      })
      // Connection-level failures are emitted on the request object.
      .on('error', reject);
  });
}

分享到