好(0) 差(0) 阅读(651) 评论(0)
才写了篇《决战RSS垃圾代码》,新订一个feed,又是源自MSN Space,又多了一堆新垃圾,又要开工奋战我的微型Tidy了。
经过半小时的浴血奋战,消灭新的垃圾代码。新版的微型Tidy如下:
/**
 * Miniature "Tidy" for RSS item summaries.
 *
 * Takes a (possibly truncated) HTML fragment from a feed and makes it safer to
 * embed: drops a dangling unterminated tag at the end, removes script/iframe
 * elements and presentational markup (font, span, style, id, border),
 * normalises br/img tags to self-closing form, escapes stray ampersands and
 * guarantees every img tag carries an alt attribute.
 *
 * This is a regex-based best-effort cleaner, not a full HTML parser.
 *
 * @param string $strSummary Raw HTML fragment taken from the feed.
 * @return string The cleaned fragment.
 */
function clean_summary($strSummary) {
    // Feed summaries are often cut in the middle of a tag. If the last "<" has
    // no ">" after it, drop the dangling tag. The explicit false check matters:
    // comparing an int offset against false would miss a dangling tag at offset 0.
    // (Truncated entities need no handling here; the ampersand pass below covers them.)
    $intL = strrpos($strSummary, "<");
    if ($intL !== false) {
        $intR = strrpos($strSummary, ">");
        if ($intR === false || $intL > $intR) {
            $strSummary = (string) substr($strSummary, 0, $intL);
        }
    }

    // Normalise line breaks and drop closing tags of elements removed below.
    $strSummary = str_replace(
        array("<br>", "<br/>", "</font>", "</span>"),
        array("<br />", "<br />", "", ""),
        $strSummary
    );

    // All patterns are single-quoted so that "\1"-style sequences can never be
    // interpreted by PHP as octal escapes; replacements use the $n form.
    $arrPatterns = array(
        // Self-closed script/iframe first, so the paired patterns below cannot
        // span from a self-closed tag to an unrelated closing tag.
        '#<script\b[^>]*/>#is',
        '#<script\b.*?</script\s*>#is',
        '#<iframe\b[^>]*/>#is',
        '#<iframe\b.*?</iframe\s*>#is',
        // img used as a music player (dynsrc): remove it altogether.
        '#<img[^>]*dynsrc[^>]*>#is',
        // Make img tags self-closing.
        '#<img([^>]*)([^/])>#is',
        // Quote purely numeric height/width values (an optional "px" is dropped).
        // Values such as "50%" are left untouched.
        '#\b(height|width)=(["\']?)(\d+)(?:px)?\2(?=[\s/>]|$)#is',
        // Strip id/border attributes; the value stops at the end of the tag.
        '#(?<=\s)(id|border)=(?:"[^"]*"|\'[^\']*\'|[^\s/>]*)#is',
        '#style="[^"]*"#is',
        '#<font[^>]*>#is',
        '#<span[^>]*>#is',
        '#<div[^>]*></div>#is',
        '#<p[^>]*></p>#is',
        // Zero-sized tracking images, whichever order the attributes come in.
        '#<img(?=[^>]*\bheight="0")(?=[^>]*\bwidth="0")[^>]*>#is'
    );
    $arrReplacements = array(
        "", "", "", "", "",
        '<img$1$2 />',
        '$1="$3"',
        "", "", "", "", "", "", ""
    );
    $strSummary = preg_replace($arrPatterns, $arrReplacements, $strSummary);

    // Escape every "&" that does not start one of the accepted entities or a
    // numeric character reference.
    $arrEntities = array("&nbsp;", "&amp;", "&lt;", "&gt;", "&quot;");
    $intAnd = -1;
    while (($intAnd = strpos($strSummary, "&", $intAnd + 1)) !== false) {
        $boolValid = false;
        $intSemicolon = strpos($strSummary, ";", $intAnd);
        // A ";" more than 6 characters away cannot belong to an accepted entity.
        if ($intSemicolon !== false && $intSemicolon - $intAnd <= 6) {
            $strEntity = substr($strSummary, $intAnd, $intSemicolon - $intAnd + 1);
            $boolValid = in_array($strEntity, $arrEntities, true)
                || preg_match('/^&#(?:\d+|x[0-9a-f]+);$/i', $strEntity) === 1;
        }
        if (!$boolValid) {
            // The inserted "&amp;" starts at $intAnd, so the next search
            // (from $intAnd + 1) never revisits it.
            $strSummary = substr_replace($strSummary, "&amp;", $intAnd, 1);
        }
    }

    // Add an empty alt attribute to every img tag that lacks one.
    $intStart = -1;
    while (($intStart = stripos($strSummary, "<img", $intStart + 1)) !== false) {
        $intEnd = strpos($strSummary, ">", $intStart);
        if ($intEnd === false) {
            break; // unterminated tag: nothing sensible left to patch
        }
        $strTag = substr($strSummary, $intStart, $intEnd - $intStart + 1);
        if (stripos($strTag, "alt=") !== false) {
            $intStart = $intEnd;
            continue;
        }
        $strSummary = substr_replace($strSummary, '<img alt=""', $intStart, 4);
    }

    return $strSummary;
}
// Page footer of the original post, preserved verbatim: "最后修改:Wen 于 2005-08-20 10:08:09"
// (last modified by Wen on 2005-08-20 10:08:09).
用户登录



