Strip <script> and <style> elements (including their contents) from the indexed content, and replace newlines with spaces so that adjacent words are not run together
This commit is contained in:
parent
9a2ff10616
commit
9dcc667bc1
1 changed file with 2 additions and 2 deletions
|
@@ -39,14 +39,14 @@ foreach ($arg as $url) {
|
|||
if (!$file)
|
||||
continue;
|
||||
$title = page_title($file);
|
||||
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', "", strip_tags($file)));
|
||||
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', " ", strip_tags(preg_replace('/<(script|style)>(.*)<\/\1>/siU', ' ',$file))));
|
||||
if (!$title || !$document) {
|
||||
echo "no title!\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
echo "title: ".$title."\n";
|
||||
|
||||
echo $document;
|
||||
$stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
|
||||
$stmt->execute([$title, $url, $document]);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue