您的位置:首页 > Web前端 > JavaScript

javascript 网页正文提取

2017-03-13 16:54 274 查看
写这篇文章的原因:最近在改一个网页正文提取的插件,但找遍了全网也没有找到 JS 版的实现,于是找了一份 Java 版的代码,并将其改写成了 JavaScript。
经测试 可用;
代码自取;
// Usage example: pass the HTML of the current page's <body> to the extractor
// and keep the extracted article text in `content`.
// `$` is presumably jQuery (not defined in this snippet) — confirm it is loaded.
// This statement must execute only after `getArticleContent` has been defined.
let content = getArticleContent($('body').html());

/**
 * Extracts the main article text from an HTML document using a line-block
 * heuristic:
 *
 *   1. strip scripts, styles, comments and tags, leaving one text line per
 *      source line;
 *   2. group the remaining lines into blocks of `blocks + 1` lines;
 *   3. mark every block whose length differs sharply from the previous block
 *      (a "peak");
 *   4. return the longest run of text lying between two consecutive peaks.
 *
 * Written as a function declaration so that it is hoisted and can be called
 * from statements that appear earlier in the same script.
 *
 * @param {string} body HTML markup to analyse. The markup must contain line
 *     breaks; tags are removed without inserting new ones.
 * @param {object} [options] Optional tuning parameters.
 * @param {number} [options.blocks=0] Extra lines per block
 *     (block size = blocks + 1). Non-integer or negative values fall back to 0.
 * @param {number} [options.changeRate=0.9] Minimum relative length change
 *     between neighbouring blocks for a block to count as a peak.
 * @param {number} [options.minLength=3] Lines must be strictly longer than
 *     this (after trimming) to be considered.
 * @returns {string|null} The extracted text, or null when `body` is not a
 *     non-empty string or fewer than three peaks are found.
 */
function getArticleContent(body, { blocks = 0, changeRate = 0.9, minLength = 3 } = {}) {
  // e.g. jQuery's .html() yields undefined when the selection is empty.
  if (typeof body !== 'string' || body === '') {
    return null;
  }

  /**
   * Removes markup and normalises whitespace.
   * @param {string} html Raw HTML.
   * @returns {string} Non-empty, trimmed text lines joined by '\n'.
   */
  const stripMarkup = function (html) {
    const text = html
      // Drop whole <script>/<style> elements first so their bodies never
      // reach the text stage (case-insensitive: <SCRIPT> is valid HTML).
      .replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, '')
      .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, '')
      // A negated class has no overlapping alternatives, so an unterminated
      // '<' cannot trigger exponential backtracking.
      .replace(/<[^>]*>/g, '');

    return text
      .split(/\r?\n/)
      // Collapse runs of space, tab, NBSP and ideographic space.
      .map((line) => line.replace(/[ \t\u00a0\u3000]+/g, ' ').trim())
      .filter((line) => line !== '')
      .join('\n');
  };

  /**
   * Groups the usable lines of `text` into fixed-size blocks.
   * @param {string} text Output of stripMarkup.
   * @returns {string[]} Dense array of block contents, in document order.
   */
  const splitIntoBlocks = function (text) {
    const blockSize = Number.isInteger(blocks) && blocks > 0 ? blocks + 1 : 1;
    const lines = text.split('\n').filter((line) => line.length > minLength);
    const result = [];
    for (let start = 0; start < lines.length; start += blockSize) {
      // The final slice may be shorter than blockSize; it is still appended
      // at the next index so the array never contains holes.
      result.push(lines.slice(start, start + blockSize).join(''));
    }
    return result;
  };

  /**
   * Finds the peaks and returns the longest text between two neighbours.
   * @param {string[]} blockList Output of splitIntoBlocks.
   * @returns {string|null} Longest segment, or null if there are too few peaks.
   */
  const pickLongestSegment = function (blockList) {
    if (blockList.length === 0) {
      return null;
    }

    const peaks = [];
    // Seed with the first block's own length so the comparison baseline is a
    // text length; the first block therefore never counts as a peak.
    let previousLength = blockList[0].length;
    blockList.forEach((block, index) => {
      const currentLength = block.length;
      const longer = Math.max(currentLength, previousLength);
      const rate = longer === 0 ? 0 : Math.abs(currentLength - previousLength) / longer;
      if (rate >= changeRate) {
        peaks.push(index);
      }
      previousLength = currentLength;
    });

    // At least three peaks are required before any segment is trusted.
    if (peaks.length <= 2) {
      return null;
    }

    let best = null;
    let bestLength = 0;
    for (let i = 1; i < peaks.length; i++) {
      // Segment covers [peaks[i - 1], peaks[i]): the opening peak block is
      // included, the closing one is not.
      const candidate = blockList.slice(peaks[i - 1], peaks[i]).join('');
      if (candidate.length > bestLength) {
        bestLength = candidate.length;
        best = candidate; // keep only the longest segment
      }
    }
    return best;
  };

  return pickLongestSegment(splitIntoBlocks(stripMarkup(body)));
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  Web前端 javascript