From f23fdaad28ea18cfe213a58bad3bdd06bbe4540e Mon Sep 17 00:00:00 2001 From: xfnw Date: Wed, 20 Oct 2021 16:41:36 -0400 Subject: [PATCH] use sqlite's FTS5 as the ranking algorithm --- crawl.php | 19 +++++++++++----- create.sql | 2 +- index.php | 64 +++++++++++------------------------------------------- urls.sh | 2 +- 4 files changed, 28 insertions(+), 59 deletions(-) diff --git a/crawl.php b/crawl.php index ce55148..d19e0a4 100644 --- a/crawl.php +++ b/crawl.php @@ -5,7 +5,15 @@ error_reporting(E_ALL); $db = new PDO("sqlite:db.sqlite"); - +$context = stream_context_create( + array( + 'http' => array( + 'follow_location' => false, + 'timeout' => 2, + 'user_agent' => 'searplbot/1.0' + ) + ) +); $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING ); //$stmt = $db->prepare($sql); @@ -29,14 +37,13 @@ array_shift($arg); foreach ($arg as $url) { echo "\n"; - $url = preg_replace('/\/$/','',$url); echo $url."\n"; $stmt = $db->prepare('DELETE FROM indexed WHERE url = ?'); - $stmt->execute([$url]); + $stmt->execute([htmlspecialchars(htmlspecialchars_decode($url))]); - $file = file_get_contents($url); - if (!$file) + $file = file_get_contents($url, false, $context); + if (!$file || strpos($http_response_header[0],'200 OK') === false) continue; $title = page_title($file); $document = preg_replace('/[ \t]+/', ' ', preg_replace('/[\r\n]+/', " ", strip_tags(preg_replace('/<(script|style)>(.*)<\/\1>/siU', ' ',$file)))); @@ -48,5 +55,5 @@ foreach ($arg as $url) { echo "title: ".$title."\n"; $stmt = $db->prepare('INSERT INTO indexed (title, url, content) VALUES (?, ?, ?)'); - $stmt->execute([$title, $url, $document]); + $stmt->execute([htmlspecialchars(htmlspecialchars_decode($title)), htmlspecialchars(htmlspecialchars_decode($url)), htmlspecialchars(htmlspecialchars_decode($document))]); } diff --git a/create.sql b/create.sql index f1d23b7..f215edf 100644 --- a/create.sql +++ b/create.sql @@ -1,2 +1,2 @@ -CREATE TABLE indexed (id INTEGER PRIMARY KEY, title VARCHAR(255), url VARCHAR(512) UNIQUE, content TEXT) +CREATE VIRTUAL TABLE indexed USING FTS5(title, url, content); diff --git a/index.php b/index.php index a0902bc..0b6f2f3 100644 --- a/index.php +++ b/index.php @@ -26,62 +26,27 @@ $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING ); if (isset($_GET['q']) && preg_replace('/\s+/', '', $_GET['q']) != '') { - $sql = 'SELECT * FROM indexed WHERE 1=1'; - - $terms = explode(' ', trim(preg_replace('/\s+/', ' ', $_GET['q']))); - $params = array(); - foreach ($terms as $term) { - if (substr($term, 0, 1) == '-') { - - $sql = $sql . ' AND content NOT LIKE ?'; - array_push($params,'%'.substr($term,1).'%'); - } else { - - $sql = $sql . ' AND content LIKE ?'; - array_push($params,'%'.$term.'%'); - } - } - $sql = $sql . ';'; + $sql = "SELECT title,url,snippet(indexed,2,'','','...',15) as snippet FROM indexed WHERE indexed MATCH ? ORDER BY bm25(indexed,2,2,1)"; + $params = [$_GET['q']]; $stmt = $db->prepare($sql); + + set_error_handler(function ($_,$msg) {echo '
'.substr($msg,65).'. you may want to view this documentation on writing valid queries.
';}, E_WARNING); $stmt->execute($params); - - - $rows = array(); - $scores = array(); - while ($row = $stmt->fetch()) { - $score = 0; - foreach ($terms as $param) - $score = $score + 100*(substr_count(strtolower($row['content']),strtolower($param)) / strlen($row['content'])); - $score = $score + 5000*(substr_count(strtolower($row['url']),strtolower($param)) / strlen($row['url'])); - $score = $score + 3000*(substr_count(strtolower($row['title']),strtolower($param)) / strlen($row['title'])); - array_push($scores, $score); - $row['score'] = $score; - array_push($rows, $row); - } - array_multisort($scores, SORT_DESC, $rows); + restore_error_handler(); $results = false; - foreach ($rows as $row) { + while ($row = $stmt->fetch()) { $results = true; ?>
- +
-(score: ) +
-...'.htmlspecialchars(htmlspecialchars_decode($param)).''; - echo htmlspecialchars(htmlspecialchars_decode(substr($content,$pos+strlen($param),50))).'...'; - } - } - +

welcome to searpl

-i am an open source search +i am a simple, open source search engine that can find stuff :3
-normal words inputted will be tags, a -tag will blacklist the tag and -there is also unsorted SQL LIKE syntax. -
-more stuff like site: coming soon! +queries use FTS syntax.
i have query('SELECT id FROM indexed ORDER BY id DESC LIMIT 1')->fetchColumn(); +echo $db->query('SELECT rowid FROM indexed ORDER BY rowid DESC LIMIT 1')->fetchColumn(); ?> pages indexed, using &1 | tee -a wg grep '^--' wg | awk '{ print $3 }' \ - | grep -v '\.\(css\|js\|png\|gif\|jpg\|txt\|ico\|ttf\|svg\)$' \ + | grep -v '\.\(css\|js\|png\|gif\|jpg\|txt\|ico\|ttf\|svg\|rss\|atom\)$' \ | sort | uniq \ | tee -a ur