ignore script and style tags from content, and make newlines into spaces so words are not combined

This commit is contained in:
xfnw 2021-06-24 14:02:37 -04:00
parent 9a2ff10616
commit 9dcc667bc1

View file

@ -39,14 +39,14 @@ foreach ($arg as $url) {
// Nothing usable was fetched for this URL; move on to the next one.
if (!$file)
    continue;
$title = page_title($file);

// Remove <script> and <style> elements *including their contents* before the
// remaining markup is stripped, so inline JS/CSS never lands in the index.
//  - (?:\s[^>]*)? lets the opening tag carry attributes, e.g.
//    <script type="text/javascript"> or <style media="screen">.
//  - .*? with the `s` flag matches lazily across line breaks, so each element
//    is removed on its own instead of swallowing everything up to the last
//    closing tag on the page.
//  - \s* tolerates whitespace inside the closing tag (</script >).
$visible = preg_replace('/<(script|style)(?:\s[^>]*)?>.*?<\/\1\s*>/si', ' ', $file);
if ($visible === null) {
    // preg_replace() yields null on a PCRE failure (e.g. backtrack limit on a
    // very large page); fall back to the unfiltered markup rather than
    // discarding the page altogether.
    $visible = $file;
}

// Line breaks become a single space (not an empty string) so that words on
// adjacent lines are not glued together; runs of blanks are then collapsed.
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', ' ', strip_tags($visible)));

// Both a title and some text are required to index a page. Note that this
// message is also printed when only the document text is empty.
if (!$title || !$document) {
    echo "no title!\n";
    continue;
}
echo "title: ".$title."\n";
echo $document;

// Values are bound as statement parameters, so no manual escaping is needed.
$stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
$stmt->execute([$title, $url, $document]);
}