searpl/crawl.php

52 lines
1.2 KiB
PHP
Raw Normal View History

2020-12-14 21:59:16 +00:00
<?php
// Show every error, warning and notice (including startup errors) in the
// script output while crawling.
ini_set('display_errors', '1');
ini_set('display_startup_errors', '1');
error_reporting(E_ALL);
// Open the SQLite database file "db.sqlite". The path is relative, so it is
// resolved against the current working directory.
// NOTE(review): confirm that running from another directory is not expected.
$db = new PDO("sqlite:db.sqlite");
// ERRMODE_WARNING: PDO raises an E_WARNING on SQL errors instead of throwing,
// so failed calls return false rather than aborting the script.
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING );
// Usage reminder for prepared statements (not executed):
//$stmt = $db->prepare($sql);
//$stmt->execute($params);
/**
 * Extract the text of the first <title> element from an HTML document.
 *
 * @param string $fp Raw HTML markup to search.
 * @return string|null The title with every run of whitespace collapsed to a
 *                     single space and both ends trimmed, or null when no
 *                     <title> element is found.
 */
function page_title($fp) {
    // "(.*?)" is lazy so only the first <title>…</title> pair is captured;
    // "s" lets "." span line breaks and "i" accepts <TITLE>.
    // "(?:\s[^>]*)?" tolerates attributes on the opening tag, such as
    // <title lang="en">, without matching longer tag names like <titlebar>.
    $res = preg_match('/<title(?:\s[^>]*)?>(.*?)<\/title\s*>/si', $fp, $title_matches);
    if (!$res) {
        // 0 means no match and false means a PCRE error; neither yields a title.
        return null;
    }

    // Clean up title: remove EOL's and excessive whitespace.
    $title = preg_replace('/\s+/', ' ', $title_matches[1]);

    return trim($title);
}
// Every command-line argument after the script name is a URL to crawl.
$arg = array_slice($argv, 1);

foreach ($arg as $url) {
    echo "\n";
    // Drop one trailing slash so ".../page/" and ".../page" share an index row.
    $url = preg_replace('/\/$/', '', $url);
    echo $url."\n";

    $file = file_get_contents($url);
    // false means the fetch failed; an empty body has nothing to index.
    // Compare strictly so a body consisting of "0" is not mistaken for failure.
    if ($file === false || $file === '') {
        continue;
    }

    $title = page_title($file);

    // strip_tags() removes only the tags, so drop <script>/<style> elements
    // together with their contents first; otherwise JS/CSS source would end
    // up in the searchable text.
    $markup = preg_replace('/<(script|style)(?:\s[^>]*)?>.*?<\/\1\s*>/si', ' ', $file);
    if ($markup === null) {
        // PCRE failure (e.g. backtrack limit): fall back to the raw markup.
        $markup = $file;
    }

    // Collapse every run of spaces, tabs and line breaks into ONE space.
    // Line breaks must become a space, not vanish, or words on adjacent
    // lines would be glued together in the index.
    $document = trim((string) preg_replace('/[ \t\r\n]+/', ' ', strip_tags($markup)));

    // Strict checks: a title or document of "0" is still valid content.
    if ($title === null || $title === '' || $document === '') {
        echo "no title!\n";
        continue;
    }
    echo "title: ".$title."\n";

    // Replace any previous row for this URL inside one transaction, so a
    // failed INSERT cannot leave the page deleted from the index.
    $db->beginTransaction();

    // Under PDO::ERRMODE_WARNING prepare() returns false on failure; check it
    // rather than calling execute() on a boolean.
    $stmt = $db->prepare('DELETE FROM indexed WHERE url = ?');
    $ok = $stmt !== false && $stmt->execute([$url]);
    if ($ok) {
        $stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
        $ok = $stmt !== false && $stmt->execute([$title, $url, $document]);
    }

    // inTransaction() guards against beginTransaction() itself having failed.
    if ($db->inTransaction()) {
        if ($ok) {
            $db->commit();
        } else {
            $db->rollBack();
        }
    }
}