use sqlite's FTS5 as the ranking algorithm

This commit is contained in:
xfnw 2021-10-20 16:41:36 -04:00
parent 4643aeec70
commit f23fdaad28
4 changed files with 28 additions and 59 deletions

View file

@ -5,7 +5,15 @@ error_reporting(E_ALL);
// Connect to the SQLite database stored in db.sqlite (relative path).
$db = new PDO("sqlite:db.sqlite");
// Stream context used for page fetches: redirects are not followed, the
// request times out after 2 seconds, and the crawler identifies itself
// as searplbot/1.0.
$context = stream_context_create(
array(
'http' => array(
'follow_location' => false,
'timeout' => 2,
'user_agent' => 'searplbot/1.0'
)
)
);
// Have PDO report database errors as PHP warnings (E_WARNING).
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING );
//$stmt = $db->prepare($sql);
@ -29,14 +37,13 @@ array_shift($arg);
// Index every URL in $arg: drop its existing row, fetch the page, and
// reduce the HTML to plain text.
// NOTE(review): this is a change view without +/- markers; adjacent
// near-duplicate statements look like the before/after versions of a
// single line -- confirm against the real file before reading this as
// sequential code.
foreach ($arg as $url) {
echo "\n";
// Strip one trailing slash from the URL.
$url = preg_replace('/\/$/','',$url);
echo $url."\n";
// Remove any row already stored for this URL before re-indexing it.
$stmt = $db->prepare('DELETE FROM indexed WHERE url = ?');
$stmt->execute([$url]);
// Same delete, keyed on the HTML-escaped URL; decoding first avoids
// double-escaping an already-escaped value.
$stmt->execute([htmlspecialchars(htmlspecialchars_decode($url))]);
$file = file_get_contents($url);
if (!$file)
$file = file_get_contents($url, false, $context);
// Skip the URL when the body is empty/falsy or the first response header
// line does not contain the text "200 OK".
if (!$file || strpos($http_response_header[0],'200 OK') === false)
continue;
// page_title() is not defined in this view; presumably it extracts the
// document title from the HTML -- verify.
$title = page_title($file);
// Blank out script/style elements, strip the remaining tags, then collapse
// line breaks and runs of spaces/tabs into single spaces.
// NOTE(review): the pattern only matches opening tags written exactly as
// <script> or <style>; tags carrying attributes are not matched, so their
// inner text survives strip_tags() and ends up in the indexed content.
$document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', " ", strip_tags(preg_replace('/<(script|style)>(.*)<\/\1>/siU', ' ',$file))));
@ -48,5 +55,5 @@ foreach ($arg as $url) {
echo "title: ".$title."\n";
// Store the page in the indexed table (title, url, content).
$stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)');
// Raw values.
$stmt->execute([$title, $url, $document]);
// HTML-escaped values (decoded first so nothing is escaped twice).
// NOTE(review): this and the execute above look like the before/after
// versions of one line in this change view; run literally, both would
// insert a row -- confirm which one the real file keeps.
$stmt->execute([htmlspecialchars(htmlspecialchars_decode($title)), htmlspecialchars(htmlspecialchars_decode($url)), htmlspecialchars(htmlspecialchars_decode($document))]);
}

View file

@ -1,2 +1,2 @@
-- Ordinary table: integer primary key plus title, unique url, and content.
-- NOTE(review): both statements create a table named indexed, so they cannot
-- coexist; they look like the before/after versions shown by this change view.
CREATE TABLE indexed (id INTEGER PRIMARY KEY, title VARCHAR(255), url VARCHAR(512) UNIQUE, content TEXT)
-- FTS5 full-text virtual table with columns title, url, content (in that
-- order). It declares no id column and no UNIQUE constraint on url.
CREATE VIRTUAL TABLE indexed USING FTS5(title, url, content);

View file

@ -26,62 +26,27 @@ $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING );
// Run a search only when the q parameter is present and contains at least
// one non-whitespace character.
// NOTE(review): this is a change view without +/- markers; the LIKE-based
// query and manual scoring below appear to be the code being replaced by
// the FTS5 query -- confirm against the real file.
if (isset($_GET['q']) && preg_replace('/\s+/', '', $_GET['q']) != '') {
// LIKE-based search: split the query on whitespace and require every term
// to appear in content; a term starting with '-' must NOT appear.
$sql = 'SELECT * FROM indexed WHERE 1=1';
$terms = explode(' ', trim(preg_replace('/\s+/', ' ', $_GET['q'])));
$params = array();
foreach ($terms as $term) {
if (substr($term, 0, 1) == '-') {
$sql = $sql . ' AND content NOT LIKE ?';
array_push($params,'%'.substr($term,1).'%');
} else {
$sql = $sql . ' AND content LIKE ?';
array_push($params,'%'.$term.'%');
}
}
$sql = $sql . ';';
// FTS5 search: the raw query string is bound as the MATCH expression.
// snippet() takes text from column index 2 (the third column), wraps
// matches in <b> tags, uses '...' as the ellipsis and returns at most 15
// tokens. Rows are ordered by bm25() with per-column weights 2, 2, 1;
// bm25 yields smaller values for better matches, so ascending order puts
// the best match first.
// NOTE(review): the snippet contains literal <b> markup, so it has to be
// emitted as raw HTML; that is only safe if the stored text was escaped at
// index time -- verify against the indexer.
$sql = "SELECT title,url,snippet(indexed,2,'<b>','</b>','...',15) as snippet FROM indexed WHERE indexed MATCH ? ORDER BY bm25(indexed,2,2,1)";
$params = [$_GET['q']];
$stmt = $db->prepare($sql);
// While the query runs, turn E_WARNING into an on-page message box with a
// link to the FTS5 query syntax documentation. substr($msg,65) drops the
// first 65 characters of the warning -- presumably the fixed PDO prefix;
// confirm the offset.
// NOTE(review): the remaining message text is echoed without HTML escaping
// and may include fragments of the user's query -- confirm and escape.
set_error_handler(function ($_,$msg) {echo '<div class="box">'.substr($msg,65).'. you may want to view <a href="https://www.sqlite.org/fts5.html#full_text_query_syntax">this documentation</a> on writing valid queries.</div>';}, E_WARNING);
$stmt->execute($params);
// Manual scoring: occurrences of each term divided by the field length,
// weighted 100 for content, 5000 for url and 3000 for title.
// NOTE(review): the inner foreach has no braces, so only the content line
// is inside the loop; the url and title lines run once per row, using the
// last term only.
$rows = array();
$scores = array();
while ($row = $stmt->fetch()) {
$score = 0;
foreach ($terms as $param)
$score = $score + 100*(substr_count(strtolower($row['content']),strtolower($param)) / strlen($row['content']));
$score = $score + 5000*(substr_count(strtolower($row['url']),strtolower($param)) / strlen($row['url']));
$score = $score + 3000*(substr_count(strtolower($row['title']),strtolower($param)) / strlen($row['title']));
array_push($scores, $score);
$row['score'] = $score;
array_push($rows, $row);
}
// Sort rows by score, highest first.
array_multisort($scores, SORT_DESC, $rows);
// Put back whichever error handler was active before the query.
restore_error_handler();
// $results records whether at least one row was rendered.
$results = false;
// Iterate the pre-sorted $rows array.
foreach ($rows as $row) {
// Stream rows straight from the statement, in the order the query returns.
while ($row = $stmt->fetch()) {
$results = true;
?>
<div class='box'>
<a href="<?php echo htmlspecialchars(htmlspecialchars_decode($row['url'])); ?>"><?php echo htmlspecialchars(htmlspecialchars_decode($row['title'])); ?></a>
<a href="<?php echo $row['url']; ?>"><?php echo $row['title']; ?></a>
<br>
<small>(score: <?php echo round($row['score']); ?>) <?php echo htmlspecialchars(htmlspecialchars_decode($row['url'])); ?></small>
<small><?php echo $row['url']; ?></small>
<br>
...<?php
$content = $row['content'];
foreach ($terms as $param) {
$pos = strpos(strtolower($content), strtolower($param));
if ($pos !== false) {
echo htmlspecialchars(htmlspecialchars_decode(substr($content,$pos-50,50)));
echo '<strong>'.htmlspecialchars(htmlspecialchars_decode($param)).'</strong>';
echo htmlspecialchars(htmlspecialchars_decode(substr($content,$pos+strlen($param),50))).'...';
}
}
<?php
echo $row['snippet'];
?>
</div>
<?php
@ -95,22 +60,19 @@ if (isset($_GET['q']) && preg_replace('/\s+/', '', $_GET['q']) != '') {
<div class='box'>
<h2>welcome to searpl</h2>
i am an <a href='https://github.com/xfnw/searpl'>open source</a> search
i am a simple, <a href='https://github.com/xfnw/searpl'>open source</a> search
engine that can find stuff :3
</div>
<div class='box'>
normal words inputted will be tags, a -tag will blacklist the tag and
there is also unsorted SQL LIKE syntax.
<br>
more stuff like site: coming soon!
queries use <a href='https://www.sqlite.org/fts5.html#full_text_query_syntax'>FTS syntax</a>.
</div>
<div class='box'>
i have
<strong>
<?php
// Prints the highest id / rowid in the indexed table; the surrounding
// markup presents this number as the count of pages indexed.
// NOTE(review): the two echo lines look like the before/after versions of
// one line in this change view.
// NOTE(review): the highest rowid equals the row count only if no rows
// were ever deleted; if the indexer deletes and re-inserts pages this may
// overcount -- consider SELECT COUNT(*) FROM indexed.
echo $db->query('SELECT id FROM indexed ORDER BY id DESC LIMIT 1')->fetchColumn();
echo $db->query('SELECT rowid FROM indexed ORDER BY rowid DESC LIMIT 1')->fetchColumn();
?>
</strong> pages indexed, using <strong>
<?php

View file

@ -1,7 +1,7 @@
# Crawl the start URLs passed as script arguments without saving pages
# (--spider): recurse one level (-r -l1), follow links to other hosts (-H),
# wait about 2 seconds between requests (-w 2 --random-wait), try each URL
# once with a 2 second timeout, and identify as searplbot/1.0. wget's log
# output (stderr merged into stdout) is shown and appended to the file wg.
# NOTE(review): $@ is unquoted, so arguments containing spaces or glob
# characters are split/expanded by the shell; "$@" would pass them intact.
wget -w 2 --random-wait --spider --force-html --tries 1 --timeout 2 -r -l1 -H -U 'searplbot/1.0' $@ 2>&1 | tee -a wg
# Take the request lines of the log (those starting with "--"), keep the
# third whitespace-separated field (the URL), drop URLs ending in a listed
# static-asset or feed extension, de-duplicate, and append the list to the
# file ur.
# NOTE(review): the two grep -v filters look like the before/after versions
# of one line in this change view; the second is a superset of the first
# (it adds rss and atom).
grep '^--' wg | awk '{ print $3 }' \
| grep -v '\.\(css\|js\|png\|gif\|jpg\|txt\|ico\|ttf\|svg\)$' \
| grep -v '\.\(css\|js\|png\|gif\|jpg\|txt\|ico\|ttf\|svg\|rss\|atom\)$' \
| sort | uniq \
| tee -a ur