2020-12-14 21:59:16 +00:00
|
|
|
<?php
|
|
|
|
// Report and display every error, including those raised during PHP startup.
error_reporting(E_ALL);
ini_set('display_startup_errors', '1');
ini_set('display_errors', '1');

// SQLite database file "db.sqlite", given as a relative path in the DSN.
$db = new PDO('sqlite:db.sqlite');
// PDO failures are raised as PHP warnings.
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);

// Stream context used for page fetches: identify as searplbot, give up after
// 2 seconds, and do not follow redirects.
$context = stream_context_create([
    'http' => [
        'user_agent' => 'searplbot/1.0',
        'timeout' => 2,
        'follow_location' => false,
    ],
]);
|
|
|
|
|
|
|
|
/**
 * Extract the text of the first <title> element from an HTML document.
 *
 * Matching is case-insensitive, spans line breaks, and tolerates attributes
 * on the opening tag (e.g. <title lang="en">). Runs of whitespace inside the
 * title are collapsed to a single space and the result is trimmed. Entities
 * in the title are returned as-is, not decoded.
 *
 * @param string $fp Raw HTML markup to search.
 * @return string|null The cleaned title (possibly an empty string), or null
 *                     when no <title> element is found.
 */
function page_title($fp) {
    // (?:\s[^>]*)? lets the opening tag carry attributes without also matching
    // longer tag names such as <titlebar>; the lazy (.*?) stops at the first
    // closing tag.
    if (!preg_match('/<title(?:\s[^>]*)?>(.*?)<\/title\s*>/si', $fp, $title_matches)) {
        return null;
    }

    // Clean up title: remove EOLs and excessive whitespace.
    return trim(preg_replace('/\s+/', ' ', $title_matches[1]));
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Every command-line argument after the script name is a URL to (re)index.
$arg = array_slice($argv, 1);

// Decode already-escaped HTML special characters, then escape exactly once, so
// stored values are never double-encoded. The same closure is used for the
// DELETE key and the INSERT values so both always agree on the URL form.
$normalize = function ($value) {
    // NOTE(review): search and replacement are the same character here, so this
    // str_replace() changes nothing as written; the search string was possibly
    // meant to be an HTML entity for the dash. Confirm before altering.
    return htmlspecialchars(str_replace('—', '—', htmlspecialchars_decode($value)));
};

// Prepared once and re-executed for each URL.
$delete_stmt = $db->prepare('DELETE FROM indexed WHERE url = ?');
$insert_stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');

foreach ($arg as $url) {
    echo "\n";
    echo $url."\n";

    // Remove any existing row for this URL before fetching. If the fetch or
    // the parsing below fails, the URL stays removed.
    $delete_stmt->execute([$normalize($url)]);

    // Read at most 1,000,000 bytes of the response body.
    $file = file_get_contents($url, false, $context, 0, 1000000);

    // Accept only a 200 response. The status code itself is checked rather
    // than the literal "200 OK", because the reason phrase is optional and
    // varies between servers. Redirects are not followed by $context, so the
    // first header line is the final status line.
    $status_line = isset($http_response_header[0]) ? $http_response_header[0] : '';
    if (!$file || !preg_match('#^HTTP/\S+\s+200\b#', $status_line)) {
        continue;
    }

    $title = page_title($file);

    // Drop <script>/<style> elements together with their contents. The opening
    // tag may carry attributes (type=, src=, media=, ...). If PCRE gives up
    // (preg_replace returns null), fall back to the unmodified page.
    $without_code = preg_replace('/<(script|style)(?:\s[^>]*)?>.*?<\/\1\s*>/si', ' ', $file);
    if ($without_code === null) {
        $without_code = $file;
    }

    // Strip the remaining tags and collapse every run of spaces, tabs and line
    // breaks into a single space.
    $document = preg_replace('/[ \t\r\n]+/', ' ', strip_tags($without_code));

    if (!$title || !$document) {
        echo "no title!\n";
        continue;
    }

    echo "title: ".$title."\n";

    $insert_stmt->execute([$normalize($title), $normalize($url), $normalize($document)]);
}
|