mirror of
https://github.com/reactos/reactos.git
synced 2024-11-10 00:34:39 +00:00
9dab4509fa
svn path=/trunk/; revision=13064
1131 lines
33 KiB
C#
1131 lines
33 KiB
C#
using System;
|
|
using System.Data;
|
|
using System.Diagnostics;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.IO;
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
|
|
namespace HtmlHelp.ChmDecoding
|
|
{
|
|
/// <summary>
|
|
/// The class <c>FullTextSearcher</c> implements a fulltext searcher for a single chm file !
|
|
/// </summary>
|
|
internal sealed class FullTextEngine : IDisposable
|
|
{
|
|
#region Internal helper classes
|
|
/// <summary>
|
|
/// Internal class for decoding the header
|
|
/// </summary>
|
|
private sealed class FTHeader
|
|
{
|
|
/// <summary>
|
|
/// Internal member storing the number of indexed files
|
|
/// </summary>
|
|
private int _numberOfIndexFiles = 0;
|
|
/// <summary>
|
|
/// Internal member storing the offset of the root node
|
|
/// </summary>
|
|
private int _rootOffset = 0;
|
|
/// <summary>
|
|
/// Internal member storing the index-page count
|
|
/// </summary>
|
|
private int _pageCount = 0;
|
|
/// <summary>
|
|
/// Internal member storing the depth of the tree
|
|
/// </summary>
|
|
private int _depth = 0;
|
|
/// <summary>
|
|
/// Internal member storing the scale param for document index en-/decoding
|
|
/// </summary>
|
|
private byte _scaleDocIdx = 0;
|
|
/// <summary>
|
|
/// Internal member storing the scale param for code-count en-/decoding
|
|
/// </summary>
|
|
private byte _scaleCodeCnt = 0;
|
|
/// <summary>
|
|
/// Internal member storing the scale param for location codes en-/decoding
|
|
/// </summary>
|
|
private byte _scaleLocCodes = 0;
|
|
/// <summary>
|
|
/// Internal member storing the root param for document index en-/decoding
|
|
/// </summary>
|
|
private byte _rootDocIdx = 0;
|
|
/// <summary>
|
|
/// Internal member storing the root param for code-count en-/decoding
|
|
/// </summary>
|
|
private byte _rootCodeCnt = 0;
|
|
/// <summary>
|
|
/// Internal member storing the root param for location codes en-/decoding
|
|
/// </summary>
|
|
private byte _rootLocCodes = 0;
|
|
/// <summary>
|
|
/// Internal member storing the size of the nodes in bytes
|
|
/// </summary>
|
|
private int _nodeSize = 0;
|
|
/// <summary>
|
|
/// Internal member storing the length of the longest word
|
|
/// </summary>
|
|
private int _lengthOfLongestWord = 0;
|
|
/// <summary>
|
|
/// Internal member storing the total number of words
|
|
/// </summary>
|
|
private int _totalNumberOfWords = 0;
|
|
/// <summary>
|
|
/// Internal member storing the total number of unique words
|
|
/// </summary>
|
|
private int _numberOfUniqueWords = 0;
|
|
/// <summary>
|
|
/// Internal member storing the codepage identifier
|
|
/// </summary>
|
|
private int _codePage = 1252;
|
|
/// <summary>
|
|
/// Internal member storing the language code id
|
|
/// </summary>
|
|
private int _lcid = 1033;
|
|
/// <summary>
|
|
/// Internal member storing the text encoder
|
|
/// </summary>
|
|
private Encoding _textEncoder = Encoding.Default;
|
|
|
|
/// <summary>
|
|
/// Constructor of the header
|
|
/// </summary>
|
|
/// <param name="binaryData">binary data from which the header will be extracted</param>
|
|
public FTHeader(byte[] binaryData)
|
|
{
|
|
DecodeHeader(binaryData);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Internal constructor for reading from dump
|
|
/// </summary>
|
|
internal FTHeader()
|
|
{
|
|
}
|
|
|
|
/// <summary>
|
|
/// Decodes the binary header information and fills the members
|
|
/// </summary>
|
|
/// <param name="binaryData">binary data from which the header will be extracted</param>
|
|
private void DecodeHeader(byte[] binaryData)
|
|
{
|
|
MemoryStream memStream = new MemoryStream(binaryData);
|
|
BinaryReader binReader = new BinaryReader(memStream);
|
|
|
|
binReader.ReadBytes(4); // 4 unknown bytes
|
|
|
|
_numberOfIndexFiles = binReader.ReadInt32(); // number of indexed files
|
|
|
|
binReader.ReadInt32(); // unknown
|
|
binReader.ReadInt32(); // unknown
|
|
|
|
_pageCount = binReader.ReadInt32(); // page-count
|
|
_rootOffset = binReader.ReadInt32(); // file offset of the root node
|
|
_depth = binReader.ReadInt16(); // depth of the tree
|
|
|
|
binReader.ReadInt32(); // unknown
|
|
|
|
_scaleDocIdx = binReader.ReadByte();
|
|
_rootDocIdx = binReader.ReadByte();
|
|
_scaleCodeCnt = binReader.ReadByte();
|
|
_rootCodeCnt = binReader.ReadByte();
|
|
_scaleLocCodes = binReader.ReadByte();
|
|
_rootLocCodes = binReader.ReadByte();
|
|
|
|
if( (_scaleDocIdx != 2) || ( _scaleCodeCnt != 2 ) || ( _scaleLocCodes != 2 ) )
|
|
{
|
|
Debug.WriteLine("Unsupported scale for s/r encoding !");
|
|
throw new InvalidOperationException("Unsupported scale for s/r encoding !");
|
|
}
|
|
|
|
binReader.ReadBytes(10); // unknown
|
|
|
|
_nodeSize = binReader.ReadInt32();
|
|
|
|
binReader.ReadInt32(); // unknown
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
|
|
_lengthOfLongestWord = binReader.ReadInt32();
|
|
_totalNumberOfWords = binReader.ReadInt32();
|
|
_numberOfUniqueWords = binReader.ReadInt32();
|
|
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
binReader.ReadInt32(); // not important
|
|
|
|
binReader.ReadBytes(24); // not important
|
|
|
|
_codePage = binReader.ReadInt32();
|
|
_lcid = binReader.ReadInt32();
|
|
|
|
CultureInfo ci = new CultureInfo(_lcid);
|
|
_textEncoder = Encoding.GetEncoding( ci.TextInfo.ANSICodePage );
|
|
|
|
// rest of header is not important for us
|
|
}
|
|
|
|
/// <summary>
|
|
/// Dump the class data to a binary writer
|
|
/// </summary>
|
|
/// <param name="writer">writer to write the data</param>
|
|
internal void Dump(ref BinaryWriter writer)
|
|
{
|
|
writer.Write( _numberOfIndexFiles );
|
|
writer.Write( _rootOffset );
|
|
writer.Write( _pageCount );
|
|
writer.Write( _depth );
|
|
writer.Write( _scaleDocIdx );
|
|
writer.Write( _rootDocIdx );
|
|
writer.Write( _scaleCodeCnt );
|
|
writer.Write( _rootCodeCnt );
|
|
writer.Write( _scaleLocCodes );
|
|
writer.Write( _rootLocCodes );
|
|
writer.Write( _nodeSize );
|
|
writer.Write( _lengthOfLongestWord );
|
|
writer.Write( _totalNumberOfWords );
|
|
writer.Write( _numberOfUniqueWords );
|
|
}
|
|
|
|
/// <summary>
|
|
/// Reads the object data from a dump store
|
|
/// </summary>
|
|
/// <param name="reader">reader to read the data</param>
|
|
internal void ReadDump(ref BinaryReader reader)
|
|
{
|
|
_numberOfIndexFiles = reader.ReadInt32();
|
|
_rootOffset = reader.ReadInt32();
|
|
_pageCount = reader.ReadInt32();
|
|
_depth = reader.ReadInt32();
|
|
|
|
_scaleDocIdx = reader.ReadByte();
|
|
_rootDocIdx = reader.ReadByte();
|
|
_scaleCodeCnt = reader.ReadByte();
|
|
_rootCodeCnt = reader.ReadByte();
|
|
_scaleLocCodes = reader.ReadByte();
|
|
_rootLocCodes = reader.ReadByte();
|
|
|
|
_nodeSize = reader.ReadInt32();
|
|
_lengthOfLongestWord = reader.ReadInt32();
|
|
_totalNumberOfWords = reader.ReadInt32();
|
|
_numberOfUniqueWords = reader.ReadInt32();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the number of indexed files
|
|
/// </summary>
|
|
public int IndexedFileCount
|
|
{
|
|
get { return _numberOfIndexFiles; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the file offset of the root node
|
|
/// </summary>
|
|
public int RootOffset
|
|
{
|
|
get { return _rootOffset; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the page count
|
|
/// </summary>
|
|
public int PageCount
|
|
{
|
|
get { return _pageCount; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the index depth
|
|
/// </summary>
|
|
public int Depth
|
|
{
|
|
get { return _depth; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the scale param for document index en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte ScaleDocumentIndex
|
|
{
|
|
get { return _scaleDocIdx; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the root param for the document index en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte RootDocumentIndex
|
|
{
|
|
get { return _rootDocIdx; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the scale param for the code-count en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte ScaleCodeCount
|
|
{
|
|
get { return _scaleCodeCnt; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the root param for the code-count en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte RootCodeCount
|
|
{
|
|
get { return _rootCodeCnt; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the scale param for the location codes en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte ScaleLocationCodes
|
|
{
|
|
get { return _scaleLocCodes; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the root param for the location codes en-/decoding
|
|
/// </summary>
|
|
/// <remarks>The scale and root method of integer encoding needs two parameters,
|
|
/// which I'll call s (scale) and r (root size).
|
|
/// The integer is encoded as two parts, p (prefix) and q (actual bits).
|
|
/// p determines how many bits are stored, as well as implicitly determining
|
|
/// the high-order bit of the integer. </remarks>
|
|
public byte RootLocationCodes
|
|
{
|
|
get { return _rootLocCodes; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the size in bytes of each index/leaf node
|
|
/// </summary>
|
|
public int NodeSize
|
|
{
|
|
get { return _nodeSize; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the length of the longest word in the index
|
|
/// </summary>
|
|
private int LengthOfLongestWord
|
|
{
|
|
get { return _lengthOfLongestWord; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the total number of words indexed (including duplicates)
|
|
/// </summary>
|
|
public int TotalWordCount
|
|
{
|
|
get { return _totalNumberOfWords; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the total number of unique words indexed (excluding duplicates)
|
|
/// </summary>
|
|
public int UniqueWordCount
|
|
{
|
|
get { return _numberOfUniqueWords; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the codepage identifier
|
|
/// </summary>
|
|
public int CodePage
|
|
{
|
|
get { return _codePage; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the language code id
|
|
/// </summary>
|
|
public int LCID
|
|
{
|
|
get { return _lcid; }
|
|
}
|
|
|
|
public Encoding TextEncoder
|
|
{
|
|
get
|
|
{
|
|
return _textEncoder;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/// <summary>
|
|
/// Internal class for easier hit recording and rate-calculation
|
|
/// </summary>
|
|
private sealed class HitHelper : IComparable
|
|
{
|
|
/// <summary>
|
|
/// Internal member storing the associated document index
|
|
/// </summary>
|
|
private int _documentIndex = 0;
|
|
/// <summary>
|
|
/// Internal member storing the title
|
|
/// </summary>
|
|
private string _title = "";
|
|
/// <summary>
|
|
/// Internal member storing the locale
|
|
/// </summary>
|
|
private string _locale = "";
|
|
/// <summary>
|
|
/// Internal member storing the location
|
|
/// </summary>
|
|
private string _location = "";
|
|
/// <summary>
|
|
/// Internal member storing the url
|
|
/// </summary>
|
|
private string _url = "";
|
|
/// <summary>
|
|
/// Internal member storing the rating
|
|
/// </summary>
|
|
private double _rating = 0;
|
|
/// <summary>
|
|
/// Internal member used for rating calculation
|
|
/// </summary>
|
|
private Hashtable _partialRating = new Hashtable();
|
|
|
|
/// <summary>
|
|
/// Constructor of the class
|
|
/// </summary>
|
|
/// <param name="documentIndex">document index</param>
|
|
/// <param name="title">title</param>
|
|
/// <param name="locale">locale parameter</param>
|
|
/// <param name="location">location</param>
|
|
/// <param name="url">url of document</param>
|
|
/// <param name="rating">rating</param>
|
|
public HitHelper(int documentIndex, string title, string locale, string location, string url, double rating)
|
|
{
|
|
_documentIndex = documentIndex;
|
|
_title = title;
|
|
_locale = locale;
|
|
_location = location;
|
|
_url = url;
|
|
_rating = rating;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Updates the rating for a found word
|
|
/// </summary>
|
|
/// <param name="word">word found</param>
|
|
public void UpdateRating(string word)
|
|
{
|
|
if( _partialRating[word] == null)
|
|
{
|
|
_partialRating[word] = 100.0;
|
|
}
|
|
else
|
|
{
|
|
_partialRating[word] = ((double)_partialRating[word])*1.01;
|
|
}
|
|
|
|
_rating = 0.0;
|
|
|
|
foreach(double val in _partialRating.Values)
|
|
{
|
|
_rating += val;
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Implements the CompareTo method of the IComparable interface.
|
|
/// Allows an easy sort by the document rating
|
|
/// </summary>
|
|
/// <param name="obj">object to compare</param>
|
|
/// <returns>0 ... equal, -1 ... this instance is less than obj, 1 ... this instance is greater than obj</returns>
|
|
public int CompareTo(object obj)
|
|
{
|
|
if( obj is HitHelper )
|
|
{
|
|
HitHelper hObj = (HitHelper)obj;
|
|
|
|
return this.Rating.CompareTo( hObj.Rating );
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the internal hashtable used for counting word hits of the document
|
|
/// </summary>
|
|
internal Hashtable PartialRating
|
|
{
|
|
get { return _partialRating; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the document index of the hit helper instance
|
|
/// </summary>
|
|
public int DocumentIndex
|
|
{
|
|
get { return _documentIndex; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the title
|
|
/// </summary>
|
|
public string Title
|
|
{
|
|
get { return _title; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the locale
|
|
/// </summary>
|
|
public string Locale
|
|
{
|
|
get { return _locale; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the location
|
|
/// </summary>
|
|
public string Location
|
|
{
|
|
get { return _location; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the url
|
|
/// </summary>
|
|
public string URL
|
|
{
|
|
get { return _url; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the rating
|
|
/// </summary>
|
|
public double Rating
|
|
{
|
|
get { return _rating; }
|
|
}
|
|
|
|
}
|
|
|
|
#endregion
|
|
|
|
/// <summary>
|
|
/// Regular expression getting the text between to quotes
|
|
/// </summary>
|
|
private string RE_Quotes = @"\""(?<innerText>.*?)\""";
|
|
/// <summary>
|
|
/// Internal flag specifying if the object is going to be disposed
|
|
/// </summary>
|
|
private bool disposed = false;
|
|
/// <summary>
|
|
/// Internal member storing the binary file data
|
|
/// </summary>
|
|
private byte[] _binaryFileData = null;
|
|
/// <summary>
|
|
/// Internal datatable storing the search hits
|
|
/// </summary>
|
|
private DataTable _hits =null;
|
|
/// <summary>
|
|
/// Internal arraylist for hit management
|
|
/// </summary>
|
|
private ArrayList _hitsHelper = new ArrayList();
|
|
/// <summary>
|
|
/// Internal member storing the header of the file
|
|
/// </summary>
|
|
private FTHeader _header = null;
|
|
/// <summary>
|
|
/// Internal member storing the associated chmfile object
|
|
/// </summary>
|
|
private CHMFile _associatedFile = null;
|
|
|
|
/// <summary>
|
|
/// Constructor of the class
|
|
/// </summary>
|
|
/// <param name="binaryFileData">binary file data of the $FIftiMain file</param>
|
|
/// <param name="associatedFile">associated chm file</param>
|
|
public FullTextEngine(byte[] binaryFileData, CHMFile associatedFile)
|
|
{
|
|
_binaryFileData = binaryFileData;
|
|
_associatedFile = associatedFile;
|
|
|
|
if(_associatedFile.SystemFile.FullTextSearch)
|
|
{
|
|
_header = new FTHeader(_binaryFileData); // reading header
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Standard constructor
|
|
/// </summary>
|
|
internal FullTextEngine()
|
|
{
|
|
}
|
|
|
|
#region Data dumping
|
|
/// <summary>
|
|
/// Dump the class data to a binary writer
|
|
/// </summary>
|
|
/// <param name="writer">writer to write the data</param>
|
|
internal void Dump(ref BinaryWriter writer)
|
|
{
|
|
_header.Dump(ref writer);
|
|
writer.Write( _binaryFileData.Length );
|
|
writer.Write(_binaryFileData);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Reads the object data from a dump store
|
|
/// </summary>
|
|
/// <param name="reader">reader to read the data</param>
|
|
internal void ReadDump(ref BinaryReader reader)
|
|
{
|
|
_header = new FTHeader();
|
|
_header.ReadDump(ref reader);
|
|
|
|
int nCnt = reader.ReadInt32();
|
|
_binaryFileData = reader.ReadBytes(nCnt);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Sets the associated CHMFile instance
|
|
/// </summary>
|
|
/// <param name="associatedFile">instance to set</param>
|
|
internal void SetCHMFile(CHMFile associatedFile)
|
|
{
|
|
_associatedFile = associatedFile;
|
|
}
|
|
#endregion
|
|
|
|
/// <summary>
|
|
/// Gets a flag if full-text searching is available for this chm file.
|
|
/// </summary>
|
|
public bool CanSearch
|
|
{
|
|
get { return (_associatedFile.SystemFile.FullTextSearch && (_header != null) ); }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Performs a fulltext search of a single file.
|
|
/// </summary>
|
|
/// <param name="search">word(s) or phrase to search</param>
|
|
/// <param name="partialMatches">true if partial word should be matched also
|
|
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
|
|
/// <param name="titleOnly">true if only search in titles</param>
|
|
/// <remarks>Hits are available through the <see cref="Hits">Hists property</see>.</remarks>
|
|
public bool Search(string search, bool partialMatches, bool titleOnly)
|
|
{
|
|
return Search(search, -1, partialMatches, titleOnly);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Performs a fulltext search of a single file.
|
|
/// </summary>
|
|
/// <param name="search">word(s) or phrase to search</param>
|
|
/// <param name="MaxHits">max hits. If this number is reached, the search will be interrupted</param>
|
|
/// <param name="partialMatches">true if partial word should be matched also
|
|
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
|
|
/// <param name="titleOnly">true if only search in titles</param>
|
|
/// <remarks>Hits are available through the <see cref="Hits">Hists property</see>.</remarks>
|
|
public bool Search(string search, int MaxHits, bool partialMatches, bool titleOnly)
|
|
{
|
|
if(CanSearch)
|
|
{
|
|
string searchString = search;
|
|
|
|
// Check if this is a quoted string
|
|
bool IsQuoted = (search.IndexOf("\"")>-1);
|
|
|
|
if(IsQuoted)
|
|
searchString = search.Replace("\"",""); // remove the quotes during search
|
|
|
|
bool bRet = true;
|
|
|
|
_hitsHelper = null;
|
|
_hitsHelper = new ArrayList();
|
|
|
|
_hits = null;
|
|
CreateHitsTable();
|
|
|
|
string[] words = searchString.Split(new char[] {' '});
|
|
|
|
for(int i=0; i<words.Length; i++)
|
|
{
|
|
bRet &= SearchSingleWord(words[i], MaxHits, partialMatches, titleOnly);
|
|
if(_hitsHelper.Count >= MaxHits)
|
|
break;
|
|
}
|
|
|
|
if(bRet && IsQuoted)
|
|
{
|
|
FinalizeQuoted(search);
|
|
}
|
|
|
|
if(bRet)
|
|
{
|
|
_hitsHelper.Sort();
|
|
|
|
int nhCount = MaxHits;
|
|
|
|
if( MaxHits < 0)
|
|
{
|
|
nhCount = _hitsHelper.Count;
|
|
}
|
|
|
|
if( nhCount > _hitsHelper.Count )
|
|
nhCount = _hitsHelper.Count;
|
|
|
|
// create hits datatable
|
|
for(int i=nhCount; i > 0; i--)
|
|
{
|
|
HitHelper curHlp = (HitHelper)(_hitsHelper[i-1]);
|
|
|
|
DataRow newRow = _hits.NewRow();
|
|
|
|
newRow["Rating"] = curHlp.Rating;
|
|
newRow["Title"] = curHlp.Title;
|
|
newRow["Locale"] = curHlp.Locale;
|
|
newRow["Location"] = curHlp.Location;
|
|
newRow["URL"] = curHlp.URL;
|
|
|
|
_hits.Rows.Add( newRow );
|
|
}
|
|
}
|
|
return bRet;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets rid of all search hits which doesn't match the quoted phrase
|
|
/// </summary>
|
|
/// <param name="search">full search string entered by the user</param>
|
|
/// <remarks>Phrase search is not possible using the internal full-text index. We're just filtering all
|
|
/// documents which don't contain all words of the phrase.</remarks>
|
|
private void FinalizeQuoted(string search)
|
|
{
|
|
Regex quoteRE = new Regex(RE_Quotes, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
|
|
int innerTextIdx = quoteRE.GroupNumberFromName("innerText");
|
|
int nIndex = 0;
|
|
|
|
// get all phrases
|
|
while( quoteRE.IsMatch(search, nIndex) )
|
|
{
|
|
Match m = quoteRE.Match(search, nIndex);
|
|
|
|
string phrase = m.Groups["innerText"].Value;
|
|
|
|
string[] wordsInPhrase = phrase.Split( new char[] {' '} );
|
|
int nCnt = _hitsHelper.Count;
|
|
|
|
for(int i=0; i < _hitsHelper.Count; i++)
|
|
{
|
|
if( ! CheckHit( ((HitHelper)(_hitsHelper[i])), wordsInPhrase) )
|
|
_hitsHelper.RemoveAt(i--);
|
|
}
|
|
|
|
nIndex = m.Index+m.Length;
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Eliminates all search hits where not all of the words have been found
|
|
/// </summary>
|
|
/// <param name="hit">hithelper instance to check</param>
|
|
/// <param name="wordsInPhrase">word list</param>
|
|
private bool CheckHit(HitHelper hit, string[] wordsInPhrase)
|
|
{
|
|
|
|
for(int i=0; i<wordsInPhrase.Length;i++)
|
|
{
|
|
if( (hit.PartialRating[wordsInPhrase[i]] == null) || (((double)(hit.PartialRating[wordsInPhrase[i]])) == 0.0) )
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Performs a search for a single word in the index
|
|
/// </summary>
|
|
/// <param name="word">word to search</param>
|
|
/// <param name="MaxHits">maximal hits to return</param>
|
|
/// <param name="partialMatches">true if partial word should be matched also
|
|
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
|
|
/// <param name="titleOnly">true if only search in titles</param>
|
|
/// <returns>Returns true if succeeded</returns>
|
|
private bool SearchSingleWord(string word,int MaxHits, bool partialMatches, bool titleOnly)
|
|
{
|
|
string wordLower = word.ToLower();
|
|
|
|
MemoryStream memStream = new MemoryStream(_binaryFileData);
|
|
BinaryReader binReader = new BinaryReader(memStream);
|
|
|
|
// seek to root node
|
|
binReader.BaseStream.Seek( _header.RootOffset, SeekOrigin.Begin );
|
|
|
|
if( _header.Depth > 2 )
|
|
{
|
|
// unsupported index depth
|
|
Debug.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
|
|
Debug.WriteLine("File: " + _associatedFile.ChmFilePath);
|
|
Debug.WriteLine(" ");
|
|
return false;
|
|
}
|
|
|
|
if( _header.Depth > 1 )
|
|
{
|
|
// seek to the right leaf node ( if depth == 1, we are at the leaf node)
|
|
int freeSpace = binReader.ReadInt16();
|
|
|
|
for(int i=0; i < _header.PageCount; ++i)
|
|
{
|
|
// exstract index entries
|
|
int nWLength = (int)binReader.ReadByte();
|
|
int nCPosition = (int)binReader.ReadByte();
|
|
|
|
string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength-1, 0, true, _header.TextEncoder);
|
|
|
|
int nLeafOffset = binReader.ReadInt32();
|
|
binReader.ReadInt16(); // unknown
|
|
|
|
if( sName.CompareTo(wordLower) >= 0)
|
|
{
|
|
// store current position
|
|
long curPos = binReader.BaseStream.Position;
|
|
|
|
// seek to leaf offset
|
|
binReader.BaseStream.Seek( nLeafOffset, SeekOrigin.Begin );
|
|
|
|
// read leafnode
|
|
ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);
|
|
|
|
// return to current position and continue reading index nodes
|
|
binReader.BaseStream.Seek( curPos, SeekOrigin.Begin );
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Reads a leaf node and extracts documents which holds the searched word
|
|
/// </summary>
|
|
/// <param name="binReader">reference to the reader</param>
|
|
/// <param name="word">word to search</param>
|
|
/// <param name="MaxHits">maximal hits to return</param>
|
|
/// <param name="partialMatches">true if partial word should be matched also
|
|
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
|
|
/// <param name="titleOnly">true if only search in titles</param>
|
|
private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
|
|
{
|
|
int nNextPageOffset = binReader.ReadInt32();
|
|
binReader.ReadInt16(); // unknown
|
|
int lfreeSpace = binReader.ReadInt16();
|
|
string curFullWord = "";
|
|
bool bFound = false;
|
|
string wordLower = word.ToLower();
|
|
|
|
for(;;)
|
|
{
|
|
if(binReader.BaseStream.Position >= binReader.BaseStream.Length)
|
|
break;
|
|
|
|
int nWLength = (int)binReader.ReadByte();
|
|
|
|
if(nWLength == 0)
|
|
break;
|
|
|
|
int nCPosition = (int)binReader.ReadByte();
|
|
|
|
string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength-1, 0, true, _header.TextEncoder);
|
|
|
|
int Context = (int)binReader.ReadByte(); // 0...body tag, 1...title tag, others unknown
|
|
|
|
long nrOfWCL = BinaryReaderHelp.ReadENCINT(ref binReader);
|
|
int wclOffset = binReader.ReadInt32();
|
|
|
|
binReader.ReadInt16(); // unknown
|
|
|
|
long bytesOfWCL = BinaryReaderHelp.ReadENCINT(ref binReader);
|
|
|
|
if( nCPosition > 0)
|
|
{
|
|
curFullWord = CombineStrings(curFullWord, sName, nCPosition);
|
|
}
|
|
else
|
|
{
|
|
curFullWord = sName;
|
|
}
|
|
|
|
bFound = false;
|
|
if(partialMatches)
|
|
bFound = ( curFullWord.IndexOf(wordLower) >= 0 );
|
|
else
|
|
bFound = (curFullWord == wordLower);
|
|
|
|
if( bFound )
|
|
{
|
|
if( (titleOnly && (Context==1)) || (!titleOnly) )
|
|
{
|
|
// store actual offset
|
|
long curPos = binReader.BaseStream.Position;
|
|
|
|
// found the word, begin with WCL encoding
|
|
binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin );
|
|
|
|
byte[] wclBytes = binReader.ReadBytes((int)bytesOfWCL);
|
|
|
|
DecodeWCL(wclBytes, MaxHits, word);
|
|
|
|
// back and continue reading leafnodes
|
|
binReader.BaseStream.Seek(curPos, SeekOrigin.Begin );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Decodes the s/r encoded WordCodeList (=wcl) and creates hit entries
|
|
/// </summary>
|
|
/// <param name="wclBytes">wcl encoded byte array</param>
|
|
/// <param name="MaxHits">maximal hits</param>
|
|
/// <param name="word">the word to find</param>
|
|
private void DecodeWCL(byte[] wclBytes,int MaxHits, string word)
|
|
{
|
|
byte[] wclBits = new byte[ wclBytes.Length*8 ];
|
|
|
|
int nBitIdx=0;
|
|
|
|
for(int i=0; i<wclBytes.Length; i++)
|
|
{
|
|
for(int j=0; j<8; j++)
|
|
{
|
|
wclBits[nBitIdx] = ((byte)(wclBytes[i] & ((byte)( (byte)0x1 << (7-j) )))) > (byte)0 ? (byte)1 : (byte)0;
|
|
nBitIdx++;
|
|
}
|
|
}
|
|
|
|
nBitIdx = 0;
|
|
|
|
int nDocIdx = 0; // delta encoded
|
|
|
|
while(nBitIdx < wclBits.Length)
|
|
{
|
|
nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
|
|
int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);
|
|
|
|
int nWordLocation = 0; // delta encoded
|
|
|
|
for(int locidx=0; locidx<nCodeCnt; locidx++)
|
|
{
|
|
nWordLocation += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
|
|
}
|
|
// apply padding
|
|
while( (nBitIdx % 8) != 0)
|
|
nBitIdx++;
|
|
|
|
// Record hit
|
|
HitHelper hitObj = DocumentHit(nDocIdx);
|
|
|
|
if(hitObj == null)
|
|
{
|
|
if(_hitsHelper.Count > MaxHits)
|
|
return;
|
|
|
|
hitObj = new HitHelper(nDocIdx, ((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).Title,
|
|
((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).Locale, _associatedFile.CompileFile,
|
|
((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).URL, 0.0);
|
|
|
|
for(int k=0;k<nCodeCnt;k++)
|
|
hitObj.UpdateRating(word);
|
|
|
|
_hitsHelper.Add(hitObj);
|
|
}
|
|
else
|
|
{
|
|
for(int k=0;k<nCodeCnt;k++)
|
|
hitObj.UpdateRating(word);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// <summary>
|
|
/// Combines a "master" word with a partial word.
|
|
/// </summary>
|
|
/// <param name="word">the master word</param>
|
|
/// <param name="partial">the partial word</param>
|
|
/// <param name="partialPosition">position to place the parial word</param>
|
|
/// <returns>returns a combined string</returns>
|
|
private string CombineStrings(string word, string partial, int partialPosition)
|
|
{
|
|
string sCombined = word;
|
|
int i=0;
|
|
|
|
for(i=0; i<partial.Length; i++)
|
|
{
|
|
if( (i+partialPosition) > (sCombined.Length-1) )
|
|
{
|
|
sCombined += partial[i];
|
|
}
|
|
else
|
|
{
|
|
StringBuilder sb = new StringBuilder(sCombined);
|
|
|
|
sb.Replace( sCombined[partialPosition+i], partial[i], partialPosition+i, 1);
|
|
sCombined = sb.ToString();
|
|
}
|
|
}
|
|
|
|
if(! ((i+partialPosition) > (sCombined.Length-1)) )
|
|
{
|
|
sCombined = sCombined.Substring(0, partialPosition+partial.Length);
|
|
}
|
|
|
|
return sCombined;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets the HitHelper instance for a specific document index
|
|
/// </summary>
|
|
/// <param name="index">document index</param>
|
|
/// <returns>The reference of the hithelper instance for this document index, otherwise null</returns>
|
|
private HitHelper DocumentHit(int index)
|
|
{
|
|
foreach(HitHelper curObj in _hitsHelper)
|
|
{
|
|
if( curObj.DocumentIndex == index)
|
|
return curObj;
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
/// <summary>
|
|
/// Creates a DataTable for storing the hits
|
|
/// </summary>
|
|
private void CreateHitsTable()
|
|
{
|
|
_hits = new DataTable("FT_Search_Hits");
|
|
|
|
DataColumn ftColumn;
|
|
|
|
ftColumn = new DataColumn();
|
|
ftColumn.DataType = System.Type.GetType("System.Double");
|
|
ftColumn.ColumnName = "Rating";
|
|
ftColumn.ReadOnly = false;
|
|
ftColumn.Unique = false;
|
|
|
|
_hits.Columns.Add(ftColumn);
|
|
|
|
ftColumn = new DataColumn();
|
|
ftColumn.DataType = System.Type.GetType("System.String");
|
|
ftColumn.ColumnName = "Title";
|
|
ftColumn.ReadOnly = false;
|
|
ftColumn.Unique = false;
|
|
|
|
_hits.Columns.Add(ftColumn);
|
|
|
|
ftColumn = new DataColumn();
|
|
ftColumn.DataType = System.Type.GetType("System.String");
|
|
ftColumn.ColumnName = "Locale";
|
|
ftColumn.ReadOnly = false;
|
|
ftColumn.Unique = false;
|
|
|
|
_hits.Columns.Add(ftColumn);
|
|
|
|
ftColumn = new DataColumn();
|
|
ftColumn.DataType = System.Type.GetType("System.String");
|
|
ftColumn.ColumnName = "Location";
|
|
ftColumn.ReadOnly = false;
|
|
ftColumn.Unique = false;
|
|
|
|
_hits.Columns.Add(ftColumn);
|
|
|
|
ftColumn = new DataColumn();
|
|
ftColumn.DataType = System.Type.GetType("System.String");
|
|
ftColumn.ColumnName = "URL";
|
|
ftColumn.ReadOnly = false;
|
|
ftColumn.Unique = false;
|
|
|
|
_hits.Columns.Add(ftColumn);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Gets an datatable containing the hits of the last search
|
|
/// </summary>
|
|
public DataTable Hits
|
|
{
|
|
get { return _hits; }
|
|
}
|
|
|
|
/// <summary>
|
|
/// Implement IDisposable.
|
|
/// </summary>
|
|
public void Dispose()
|
|
{
|
|
Dispose(true);
|
|
// This object will be cleaned up by the Dispose method.
|
|
// Therefore, you should call GC.SupressFinalize to
|
|
// take this object off the finalization queue
|
|
// and prevent finalization code for this object
|
|
// from executing a second time.
|
|
GC.SuppressFinalize(this);
|
|
}
|
|
|
|
/// <summary>
|
|
/// Dispose(bool disposing) executes in two distinct scenarios.
|
|
/// If disposing equals true, the method has been called directly
|
|
/// or indirectly by a user's code. Managed and unmanaged resources
|
|
/// can be disposed.
|
|
/// If disposing equals false, the method has been called by the
|
|
/// runtime from inside the finalizer and you should not reference
|
|
/// other objects. Only unmanaged resources can be disposed.
|
|
/// </summary>
|
|
/// <param name="disposing">disposing flag</param>
|
|
private void Dispose(bool disposing)
|
|
{
|
|
// Check to see if Dispose has already been called.
|
|
if(!this.disposed)
|
|
{
|
|
// If disposing equals true, dispose all managed
|
|
// and unmanaged resources.
|
|
if(disposing)
|
|
{
|
|
// Dispose managed resources.
|
|
_binaryFileData = null;
|
|
}
|
|
}
|
|
disposed = true;
|
|
}
|
|
}
|
|
}
|