Ignore script and style tags in content, and convert newlines to spaces so that words are not run together
This commit is contained in:
parent
9a2ff10616
commit
9dcc667bc1
1 changed file with 2 additions and 2 deletions
|
@ -39,14 +39,14 @@ foreach ($arg as $url) {
|
||||||
if (!$file)
|
if (!$file)
|
||||||
continue;
|
continue;
|
||||||
$title = page_title($file);
|
$title = page_title($file);
|
||||||
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', "", strip_tags($file)));
|
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', " ", strip_tags(preg_replace('/<(script|style)>(.*)<\/\1>/siU', ' ',$file))));
|
||||||
if (!$title || !$document) {
|
if (!$title || !$document) {
|
||||||
echo "no title!\n";
|
echo "no title!\n";
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "title: ".$title."\n";
|
echo "title: ".$title."\n";
|
||||||
|
echo $document;
|
||||||
$stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
|
$stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
|
||||||
$stmt->execute([$title, $url, $document]);
|
$stmt->execute([$title, $url, $document]);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue