reactos/irc/TechBot/CHMLibrary/CHMDecoding/FullTextEngine.cs
Casper Hornstrup 9dab4509fa Import TechBot
svn path=/trunk/; revision=13064
2005-01-15 19:27:25 +00:00

1131 lines
33 KiB
C#

using System;
using System.Data;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Collections;
using System.Globalization;
namespace HtmlHelp.ChmDecoding
{
/// <summary>
/// The class <c>FullTextEngine</c> implements a full-text searcher for a single chm file.
/// </summary>
internal sealed class FullTextEngine : IDisposable
{
#region Internal helper classes
/// <summary>
/// Decoded header of the full-text index data.
/// </summary>
/// <remarks>
/// The header provides the offset of the root node, the depth of the index tree, the node size
/// and the scale/root parameter pairs needed to decode the word-location lists.
/// The scale and root method of integer encoding needs two parameters, s (scale) and r (root size).
/// The integer is encoded as two parts, p (prefix) and q (actual bits). p determines how many
/// bits are stored, as well as implicitly determining the high-order bit of the integer.
/// Only a scale of 2 is supported by this decoder.
/// </remarks>
private sealed class FTHeader
{
    // number of indexed files
    private int _numberOfIndexFiles = 0;
    // file offset of the root node
    private int _rootOffset = 0;
    // number of index pages
    private int _pageCount = 0;
    // depth of the index tree
    private int _depth = 0;
    // scale (s) parameters of the three s/r encoded value kinds
    private byte _scaleDocIdx = 0;
    private byte _scaleCodeCnt = 0;
    private byte _scaleLocCodes = 0;
    // root (r) parameters of the three s/r encoded value kinds
    private byte _rootDocIdx = 0;
    private byte _rootCodeCnt = 0;
    private byte _rootLocCodes = 0;
    // size of a node in bytes
    private int _nodeSize = 0;
    // word statistics
    private int _lengthOfLongestWord = 0;
    private int _totalNumberOfWords = 0;
    private int _numberOfUniqueWords = 0;
    // code page identifier and language code id (defaults: 1252 / 1033)
    private int _codePage = 1252;
    private int _lcid = 1033;
    // encoding used to decode the words stored in the index
    private Encoding _textEncoder = Encoding.Default;

    /// <summary>
    /// Creates the header by decoding it from the binary index data
    /// </summary>
    /// <param name="binaryData">binary data from which the header will be extracted</param>
    /// <exception cref="InvalidOperationException">one of the scale parameters is not 2</exception>
    public FTHeader(byte[] binaryData)
    {
        DecodeHeader(binaryData);
    }

    /// <summary>
    /// Creates an empty header which is filled afterwards using <see cref="ReadDump"/>
    /// </summary>
    internal FTHeader()
    {
    }

    /// <summary>
    /// Decodes the binary header information and fills the members
    /// </summary>
    /// <param name="binaryData">binary data from which the header will be extracted</param>
    private void DecodeHeader(byte[] binaryData)
    {
        // the reader owns the memory stream and closes it when it is disposed
        using (BinaryReader binReader = new BinaryReader(new MemoryStream(binaryData)))
        {
            binReader.ReadBytes(4); // 4 unknown bytes
            _numberOfIndexFiles = binReader.ReadInt32(); // number of indexed files
            binReader.ReadInt32(); // unknown
            binReader.ReadInt32(); // unknown
            _pageCount = binReader.ReadInt32(); // page-count
            _rootOffset = binReader.ReadInt32(); // file offset of the root node
            _depth = binReader.ReadInt16(); // depth of the tree
            binReader.ReadInt32(); // unknown

            _scaleDocIdx = binReader.ReadByte();
            _rootDocIdx = binReader.ReadByte();
            _scaleCodeCnt = binReader.ReadByte();
            _rootCodeCnt = binReader.ReadByte();
            _scaleLocCodes = binReader.ReadByte();
            _rootLocCodes = binReader.ReadByte();

            if( (_scaleDocIdx != 2) || ( _scaleCodeCnt != 2 ) || ( _scaleLocCodes != 2 ) )
            {
                Debug.WriteLine("Unsupported scale for s/r encoding !");
                throw new InvalidOperationException("Unsupported scale for s/r encoding !");
            }

            binReader.ReadBytes(10); // unknown
            _nodeSize = binReader.ReadInt32();
            binReader.ReadBytes(12); // three 32-bit values: one unknown, two not important

            _lengthOfLongestWord = binReader.ReadInt32();
            _totalNumberOfWords = binReader.ReadInt32();
            _numberOfUniqueWords = binReader.ReadInt32();

            binReader.ReadBytes(24); // six 32-bit values, not important
            binReader.ReadBytes(24); // not important

            _codePage = binReader.ReadInt32();
            _lcid = binReader.ReadInt32();
            // rest of header is not important for us
        }

        _textEncoder = ResolveEncoding(_lcid, _codePage);
    }

    /// <summary>
    /// Selects the text encoding for the index words.
    /// </summary>
    /// <param name="lcid">language code id read from the header</param>
    /// <param name="codePage">code page identifier read from the header</param>
    /// <returns>The ANSI code page encoding of the culture; if the culture or its code page is
    /// not available the encoding of <paramref name="codePage"/>; otherwise the system default.</returns>
    private static Encoding ResolveEncoding(int lcid, int codePage)
    {
        try
        {
            return Encoding.GetEncoding( new CultureInfo(lcid).TextInfo.ANSICodePage );
        }
        catch(ArgumentException)
        {
            // unknown/invalid language id or code page - try the code page of the header
        }
        catch(NotSupportedException)
        {
            // code page not supported on this platform - try the code page of the header
        }

        try
        {
            return Encoding.GetEncoding( codePage );
        }
        catch(ArgumentException)
        {
            // fall through to the system default
        }
        catch(NotSupportedException)
        {
            // fall through to the system default
        }

        return Encoding.Default;
    }

    /// <summary>
    /// Dump the class data to a binary writer
    /// </summary>
    /// <param name="writer">writer to write the data</param>
    /// <remarks>Code page, language id and text encoder are not part of the dump.</remarks>
    internal void Dump(ref BinaryWriter writer)
    {
        writer.Write( _numberOfIndexFiles );
        writer.Write( _rootOffset );
        writer.Write( _pageCount );
        writer.Write( _depth );

        writer.Write( _scaleDocIdx );
        writer.Write( _rootDocIdx );
        writer.Write( _scaleCodeCnt );
        writer.Write( _rootCodeCnt );
        writer.Write( _scaleLocCodes );
        writer.Write( _rootLocCodes );

        writer.Write( _nodeSize );
        writer.Write( _lengthOfLongestWord );
        writer.Write( _totalNumberOfWords );
        writer.Write( _numberOfUniqueWords );
    }

    /// <summary>
    /// Reads the object data from a dump store
    /// </summary>
    /// <param name="reader">reader to read the data</param>
    /// <remarks>The values are read in the order written by <see cref="Dump"/>. Code page,
    /// language id and text encoder are not stored in a dump and keep their current values.</remarks>
    internal void ReadDump(ref BinaryReader reader)
    {
        _numberOfIndexFiles = reader.ReadInt32();
        _rootOffset = reader.ReadInt32();
        _pageCount = reader.ReadInt32();
        _depth = reader.ReadInt32();

        _scaleDocIdx = reader.ReadByte();
        _rootDocIdx = reader.ReadByte();
        _scaleCodeCnt = reader.ReadByte();
        _rootCodeCnt = reader.ReadByte();
        _scaleLocCodes = reader.ReadByte();
        _rootLocCodes = reader.ReadByte();

        _nodeSize = reader.ReadInt32();
        _lengthOfLongestWord = reader.ReadInt32();
        _totalNumberOfWords = reader.ReadInt32();
        _numberOfUniqueWords = reader.ReadInt32();
    }

    /// <summary>
    /// Gets the number of indexed files
    /// </summary>
    public int IndexedFileCount
    {
        get { return _numberOfIndexFiles; }
    }

    /// <summary>
    /// Gets the file offset of the root node
    /// </summary>
    public int RootOffset
    {
        get { return _rootOffset; }
    }

    /// <summary>
    /// Gets the page count
    /// </summary>
    public int PageCount
    {
        get { return _pageCount; }
    }

    /// <summary>
    /// Gets the index depth
    /// </summary>
    public int Depth
    {
        get { return _depth; }
    }

    /// <summary>
    /// Gets the scale param (s) for document index en-/decoding
    /// </summary>
    public byte ScaleDocumentIndex
    {
        get { return _scaleDocIdx; }
    }

    /// <summary>
    /// Gets the root param (r) for the document index en-/decoding
    /// </summary>
    public byte RootDocumentIndex
    {
        get { return _rootDocIdx; }
    }

    /// <summary>
    /// Gets the scale param (s) for the code-count en-/decoding
    /// </summary>
    public byte ScaleCodeCount
    {
        get { return _scaleCodeCnt; }
    }

    /// <summary>
    /// Gets the root param (r) for the code-count en-/decoding
    /// </summary>
    public byte RootCodeCount
    {
        get { return _rootCodeCnt; }
    }

    /// <summary>
    /// Gets the scale param (s) for the location codes en-/decoding
    /// </summary>
    public byte ScaleLocationCodes
    {
        get { return _scaleLocCodes; }
    }

    /// <summary>
    /// Gets the root param (r) for the location codes en-/decoding
    /// </summary>
    public byte RootLocationCodes
    {
        get { return _rootLocCodes; }
    }

    /// <summary>
    /// Gets the size in bytes of each index/leaf node
    /// </summary>
    public int NodeSize
    {
        get { return _nodeSize; }
    }

    /// <summary>
    /// Gets the length of the longest word in the index
    /// </summary>
    private int LengthOfLongestWord
    {
        get { return _lengthOfLongestWord; }
    }

    /// <summary>
    /// Gets the total number of words indexed (including duplicates)
    /// </summary>
    public int TotalWordCount
    {
        get { return _totalNumberOfWords; }
    }

    /// <summary>
    /// Gets the total number of unique words indexed (excluding duplicates)
    /// </summary>
    public int UniqueWordCount
    {
        get { return _numberOfUniqueWords; }
    }

    /// <summary>
    /// Gets the codepage identifier
    /// </summary>
    public int CodePage
    {
        get { return _codePage; }
    }

    /// <summary>
    /// Gets the language code id
    /// </summary>
    public int LCID
    {
        get { return _lcid; }
    }

    /// <summary>
    /// Gets the encoding used to decode the words of the index
    /// </summary>
    public Encoding TextEncoder
    {
        get { return _textEncoder; }
    }
}
/// <summary>
/// Internal class collecting the data of one found document and calculating its rating
/// </summary>
private sealed class HitHelper : IComparable
{
    // index of the associated document
    private int _documentIndex = 0;
    // descriptive data of the document
    private string _title = "";
    private string _locale = "";
    private string _location = "";
    private string _url = "";
    // current rating (sum of all partial ratings)
    private double _rating = 0;
    // partial rating per found word (key: word, value: boxed double)
    private Hashtable _partialRating = new Hashtable();

    /// <summary>
    /// Constructor of the class
    /// </summary>
    /// <param name="documentIndex">document index</param>
    /// <param name="title">title</param>
    /// <param name="locale">locale parameter</param>
    /// <param name="location">location</param>
    /// <param name="url">url of document</param>
    /// <param name="rating">initial rating</param>
    public HitHelper(int documentIndex, string title, string locale, string location, string url, double rating)
    {
        this._documentIndex = documentIndex;
        this._title = title;
        this._locale = locale;
        this._location = location;
        this._url = url;
        this._rating = rating;
    }

    /// <summary>
    /// Gets the document index of the hit helper instance
    /// </summary>
    public int DocumentIndex
    {
        get { return _documentIndex; }
    }

    /// <summary>
    /// Gets the title
    /// </summary>
    public string Title
    {
        get { return _title; }
    }

    /// <summary>
    /// Gets the locale
    /// </summary>
    public string Locale
    {
        get { return _locale; }
    }

    /// <summary>
    /// Gets the location
    /// </summary>
    public string Location
    {
        get { return _location; }
    }

    /// <summary>
    /// Gets the url
    /// </summary>
    public string URL
    {
        get { return _url; }
    }

    /// <summary>
    /// Gets the rating
    /// </summary>
    public double Rating
    {
        get { return _rating; }
    }

    /// <summary>
    /// Gets the internal hashtable used for counting word hits of the document
    /// </summary>
    internal Hashtable PartialRating
    {
        get { return _partialRating; }
    }

    /// <summary>
    /// Registers one more occurrence of a word and recalculates the rating
    /// </summary>
    /// <param name="word">word found</param>
    /// <remarks>The first occurrence of a word counts 100.0, every further occurrence multiplies
    /// the partial rating of this word by 1.01. The rating is the sum of all partial ratings.</remarks>
    public void UpdateRating(string word)
    {
        object previous = _partialRating[word];
        if( previous != null )
            _partialRating[word] = ((double)previous)*1.01;
        else
            _partialRating[word] = 100.0;

        double total = 0.0;
        foreach(DictionaryEntry entry in _partialRating)
        {
            total += (double)entry.Value;
        }
        _rating = total;
    }

    /// <summary>
    /// Implements the CompareTo method of the IComparable interface.
    /// Allows an easy sort by the document rating
    /// </summary>
    /// <param name="obj">object to compare</param>
    /// <returns>Result of comparing both ratings; -1 if obj is not a HitHelper instance</returns>
    public int CompareTo(object obj)
    {
        HitHelper other = obj as HitHelper;
        if( other == null )
            return -1;

        return _rating.CompareTo( other._rating );
    }
}
#endregion
/// <summary>
/// Regular expression capturing the text between two double quotes (group <c>innerText</c>)
/// </summary>
private string RE_Quotes = @"\""(?<innerText>.*?)\""";
/// <summary>
/// Flag set as soon as the object has been disposed
/// </summary>
private bool disposed;
/// <summary>
/// Binary data of the full-text index file
/// </summary>
private byte[] _binaryFileData;
/// <summary>
/// Table with the hits of the last search
/// </summary>
private DataTable _hits;
/// <summary>
/// List of <c>HitHelper</c> instances collected during a search
/// </summary>
private ArrayList _hitsHelper = new ArrayList();
/// <summary>
/// Decoded header of the full-text index file
/// </summary>
private FTHeader _header;
/// <summary>
/// The chm file this engine belongs to
/// </summary>
private CHMFile _associatedFile;
/// <summary>
/// Creates the engine for the given index data. The header is only decoded if the
/// system file of the chm reports that full-text search is available.
/// </summary>
/// <param name="binaryFileData">binary file data of the $FIftiMain file</param>
/// <param name="associatedFile">associated chm file</param>
public FullTextEngine(byte[] binaryFileData, CHMFile associatedFile)
{
    this._binaryFileData = binaryFileData;
    this._associatedFile = associatedFile;

    bool hasFullTextIndex = associatedFile.SystemFile.FullTextSearch;
    if( !hasFullTextIndex )
        return;

    _header = new FTHeader(_binaryFileData); // reading header
}
/// <summary>
/// Creates an empty engine. Data and chm file are supplied afterwards
/// using <see cref="ReadDump"/> and <see cref="SetCHMFile"/>.
/// </summary>
internal FullTextEngine()
{
}
#region Data dumping
/// <summary>
/// Writes the header, followed by the length and the bytes of the binary index data,
/// to a binary writer
/// </summary>
/// <param name="writer">writer to write the data</param>
internal void Dump(ref BinaryWriter writer)
{
    _header.Dump(ref writer);

    int byteCount = _binaryFileData.Length;
    writer.Write( byteCount );
    writer.Write( _binaryFileData );
}
/// <summary>
/// Restores the header and the binary index data from a dump store
/// </summary>
/// <param name="reader">reader to read the data</param>
/// <remarks>The data is expected in the order written by <see cref="Dump"/>:
/// header values, byte count, bytes.</remarks>
internal void ReadDump(ref BinaryReader reader)
{
    _header = new FTHeader();
    _header.ReadDump(ref reader);

    int byteCount = reader.ReadInt32();
    byte[] indexData = reader.ReadBytes(byteCount);
    _binaryFileData = indexData;
}
/// <summary>
/// Attaches the engine to a CHMFile instance
/// </summary>
/// <param name="associatedFile">instance to set</param>
internal void SetCHMFile(CHMFile associatedFile)
{
    this._associatedFile = associatedFile;
}
#endregion
/// <summary>
/// Gets a flag if full-text searching is available for this chm file.
/// </summary>
/// <remarks>Returns false if no chm file has been associated yet, if the system file
/// reports no full-text search or if no header is available.</remarks>
public bool CanSearch
{
    get
    {
        // an engine created for reading a dump has no chm file until SetCHMFile() is called
        if( _associatedFile == null )
            return false;

        return (_associatedFile.SystemFile.FullTextSearch && (_header != null) );
    }
}
/// <summary>
/// Performs a fulltext search of a single file.
/// </summary>
/// <param name="search">word(s) or phrase to search</param>
/// <param name="partialMatches">true if partial word should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only search in titles</param>
/// <returns>Result of the overload taking a maximum hit count, which is called with -1</returns>
/// <remarks>Hits are available through the <see cref="Hits">Hits property</see>.</remarks>
public bool Search(string search, bool partialMatches, bool titleOnly)
{
    const int noHitLimit = -1;
    return Search(search, noHitLimit, partialMatches, titleOnly);
}
/// <summary>
/// Performs a fulltext search of a single file.
/// </summary>
/// <param name="search">word(s) or phrase to search. The words are separated by blanks;
/// if the text contains double quotes the hits are additionally filtered by the quoted phrase(s)</param>
/// <param name="MaxHits">max hits. If this number is reached, the search will be interrupted.
/// A negative value means that the number of hits is not limited</param>
/// <param name="partialMatches">true if partial word should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only search in titles</param>
/// <returns>true if the search succeeded, false if searching is not available
/// or one of the words could not be searched</returns>
/// <remarks>Hits are available through the <see cref="Hits">Hits property</see>,
/// ordered by descending rating.</remarks>
public bool Search(string search, int MaxHits, bool partialMatches, bool titleOnly)
{
    if( !CanSearch )
        return false;

    // Check if this is a quoted string; the quotes are removed during the word search
    bool isQuoted = (search.IndexOf("\"") > -1);
    string searchString = isQuoted ? search.Replace("\"","") : search;

    _hitsHelper = new ArrayList();
    CreateHitsTable();

    // a negative MaxHits must not stop the loop, otherwise only the first word is searched
    bool limited = (MaxHits >= 0);
    bool succeeded = true;

    string[] words = searchString.Split(new char[] {' '});
    for(int i=0; i<words.Length; i++)
    {
        succeeded &= SearchSingleWord(words[i], MaxHits, partialMatches, titleOnly);

        if( limited && (_hitsHelper.Count >= MaxHits) )
            break;
    }

    if( !succeeded )
        return false;

    if( isQuoted )
        FinalizeQuoted(search);

    // ascending by rating
    _hitsHelper.Sort();

    int rowCount = _hitsHelper.Count;
    if( limited && (MaxHits < rowCount) )
        rowCount = MaxHits;

    // create hits datatable: take the best rated hits, starting at the end of the sorted list
    int best = _hitsHelper.Count - 1;
    for(int i=0; i < rowCount; i++)
    {
        HitHelper curHlp = (HitHelper)(_hitsHelper[best-i]);

        DataRow newRow = _hits.NewRow();
        newRow["Rating"] = curHlp.Rating;
        newRow["Title"] = curHlp.Title;
        newRow["Locale"] = curHlp.Locale;
        newRow["Location"] = curHlp.Location;
        newRow["URL"] = curHlp.URL;
        _hits.Rows.Add( newRow );
    }

    return true;
}
/// <summary>
/// Gets rid of all search hits which don't match the quoted phrase(s)
/// </summary>
/// <param name="search">full search string entered by the user</param>
/// <remarks>Phrase search is not possible using the internal full-text index. We're just filtering all
/// documents which don't contain all words of the phrase.</remarks>
private void FinalizeQuoted(string search)
{
    // the expression is used for this call only, compiling it would cost more than it saves
    Regex quoteRE = new Regex(RE_Quotes, RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // visit every quoted phrase exactly once
    for(Match m = quoteRE.Match(search); m.Success; m = m.NextMatch())
    {
        string phrase = m.Groups["innerText"].Value;
        string[] wordsInPhrase = phrase.Split( new char[] {' '} );

        // walk backwards so that removing an entry does not shift the entries still to check
        for(int i=_hitsHelper.Count-1; i >= 0; i--)
        {
            if( ! CheckHit( ((HitHelper)(_hitsHelper[i])), wordsInPhrase) )
                _hitsHelper.RemoveAt(i);
        }
    }
}
/// <summary>
/// Checks if all words of a phrase have been found in the document of a hit
/// </summary>
/// <param name="hit">hithelper instance to check</param>
/// <param name="wordsInPhrase">word list</param>
/// <returns>true if the hit has a partial rating other than 0 for every word of the list</returns>
/// <remarks>Empty entries of the word list (created by consecutive blanks in the phrase)
/// are ignored.</remarks>
private bool CheckHit(HitHelper hit, string[] wordsInPhrase)
{
    for(int i=0; i<wordsInPhrase.Length;i++)
    {
        string phraseWord = wordsInPhrase[i];

        // splitting "a  b" at blanks yields an empty entry which is no word at all
        if( phraseWord.Length == 0 )
            continue;

        object partial = hit.PartialRating[phraseWord];
        if( (partial == null) || (((double)partial) == 0.0) )
            return false;
    }
    return true;
}
/// <summary>
/// Performs a search for a single word in the index
/// </summary>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial word should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only search in titles</param>
/// <returns>Returns true if succeeded, false if the index depth is not supported (greater than 2)</returns>
private bool SearchSingleWord(string word,int MaxHits, bool partialMatches, bool titleOnly)
{
    string wordLower = word.ToLower();

    BinaryReader binReader = new BinaryReader(new MemoryStream(_binaryFileData));
    try
    {
        // seek to root node
        binReader.BaseStream.Seek( _header.RootOffset, SeekOrigin.Begin );

        if( _header.Depth > 2 )
        {
            // unsupported index depth
            Debug.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
            Debug.WriteLine("File: " + _associatedFile.ChmFilePath);
            Debug.WriteLine(" ");
            return false;
        }

        if( _header.Depth == 1 )
        {
            // no index level: the root node is the leaf node itself
            ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);
        }
        else if( _header.Depth > 1 )
        {
            // the root is an index node, seek to the right leaf node(s)
            binReader.ReadInt16(); // free space of the node, not needed

            for(int i=0; i < _header.PageCount; ++i)
            {
                // extract index entry
                int nWLength = (int)binReader.ReadByte();
                binReader.ReadByte(); // character position byte, not used for index entries
                string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength-1, 0, true, _header.TextEncoder);
                int nLeafOffset = binReader.ReadInt32();
                binReader.ReadInt16(); // unknown

                if( sName.CompareTo(wordLower) >= 0)
                {
                    // store current position
                    long curPos = binReader.BaseStream.Position;

                    // seek to leaf offset and read the leaf node
                    binReader.BaseStream.Seek( nLeafOffset, SeekOrigin.Begin );
                    ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);

                    // return to current position and continue reading index entries
                    binReader.BaseStream.Seek( curPos, SeekOrigin.Begin );
                }
            }
        }

        return true;
    }
    finally
    {
        // closes the underlying memory stream too
        binReader.Close();
    }
}
/// <summary>
/// Reads the entries of a leaf node and decodes the word code list of every entry
/// matching the searched word
/// </summary>
/// <param name="binReader">reference to the reader, positioned at the start of the leaf node</param>
/// <param name="word">word to search</param>
/// <param name="MaxHits">maximal hits to return</param>
/// <param name="partialMatches">true if partial word should be matched also
/// ( if this is true a search of 'support' will match 'supports', otherwise not )</param>
/// <param name="titleOnly">true if only search in titles</param>
/// <remarks>Reading stops at an entry with a word length of 0 or at the end of the stream.</remarks>
private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
{
    // node header; none of the values is needed for searching
    binReader.ReadInt32(); // offset of the next page
    binReader.ReadInt16(); // unknown
    binReader.ReadInt16(); // free space

    string needle = word.ToLower();
    string currentWord = "";

    while( binReader.BaseStream.Position < binReader.BaseStream.Length )
    {
        int entryLength = (int)binReader.ReadByte();
        if( entryLength == 0 )
            break;

        int charPosition = (int)binReader.ReadByte();
        string entryText = BinaryReaderHelp.ExtractString(ref binReader, entryLength-1, 0, true, _header.TextEncoder);
        int context = (int)binReader.ReadByte(); // 0...body tag, 1...title tag, others unknown
        BinaryReaderHelp.ReadENCINT(ref binReader); // number of wcl entries, not needed
        int wclOffset = binReader.ReadInt32();
        binReader.ReadInt16(); // unknown
        long wclLength = BinaryReaderHelp.ReadENCINT(ref binReader);

        // an entry with a character position > 0 is combined with the previous word
        currentWord = (charPosition > 0) ? CombineStrings(currentWord, entryText, charPosition) : entryText;

        bool matches = partialMatches ? ( currentWord.IndexOf(needle) >= 0 ) : ( currentWord == needle );
        if( !matches )
            continue;

        if( titleOnly && (context != 1) )
            continue;

        // remember the position behind the entry
        long resumeAt = binReader.BaseStream.Position;

        // found the word, read and decode its word code list
        binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin );
        byte[] wclBytes = binReader.ReadBytes((int)wclLength);
        DecodeWCL(wclBytes, MaxHits, word);

        // back and continue with the next entry
        binReader.BaseStream.Seek(resumeAt, SeekOrigin.Begin );
    }
}
/// <summary>
/// Decodes the s/r encoded WordCodeList (=wcl) and creates hit entries
/// </summary>
/// <param name="wclBytes">wcl encoded byte array</param>
/// <param name="MaxHits">maximal hits; a negative value means that the number of hits is not limited</param>
/// <param name="word">the word to find</param>
/// <remarks>Every entry of the list consists of a document index (delta encoded), the number of
/// location codes and the location codes itself, padded to the next full byte. For every entry the
/// rating of the document is updated once per location code. Decoding stops when a new document
/// would have to be recorded although more than <paramref name="MaxHits"/> hits exist.</remarks>
private void DecodeWCL(byte[] wclBytes,int MaxHits, string word)
{
    // expand the bytes to single bits, most significant bit first
    byte[] wclBits = new byte[ wclBytes.Length*8 ];
    int nBitIdx=0;

    for(int i=0; i<wclBytes.Length; i++)
    {
        for(int shift=7; shift>=0; shift--)
        {
            wclBits[nBitIdx++] = (byte)( (wclBytes[i] >> shift) & 1 );
        }
    }

    // without this flag a negative MaxHits would reject every hit ( Count > -1 is always true )
    bool limited = (MaxHits >= 0);

    nBitIdx = 0;
    int nDocIdx = 0; // delta encoded

    while(nBitIdx < wclBits.Length)
    {
        nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
        int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);

        // the word locations are not used, but they have to be read to reach the next entry
        for(int locidx=0; locidx<nCodeCnt; locidx++)
        {
            BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
        }

        // apply padding to the next byte boundary
        int overhang = nBitIdx % 8;
        if( overhang != 0 )
            nBitIdx += (8 - overhang);

        // Record hit
        HitHelper hitObj = DocumentHit(nDocIdx);

        if(hitObj == null)
        {
            if( limited && (_hitsHelper.Count > MaxHits) )
                return;

            TopicEntry topic = (TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx]);
            hitObj = new HitHelper(nDocIdx, topic.Title, topic.Locale, _associatedFile.CompileFile, topic.URL, 0.0);
            _hitsHelper.Add(hitObj);
        }

        for(int k=0;k<nCodeCnt;k++)
            hitObj.UpdateRating(word);
    }
}
/// <summary>
/// Combines a "master" word with a partial word.
/// </summary>
/// <param name="word">the master word</param>
/// <param name="partial">the partial word</param>
/// <param name="partialPosition">position to place the partial word</param>
/// <returns>The first <paramref name="partialPosition"/> characters of the master word
/// (the whole master word if it is shorter) followed by the partial word</returns>
/// <example>CombineStrings("supported", "s", 7) returns "supports"</example>
private string CombineStrings(string word, string partial, int partialPosition)
{
    // overwriting the master word from partialPosition on and cutting the rest
    // is the same as keeping the prefix and appending the partial word
    int prefixLength = Math.Min(partialPosition, word.Length);
    return word.Substring(0, prefixLength) + partial;
}
/// <summary>
/// Looks up the HitHelper instance recorded for a document index
/// </summary>
/// <param name="index">document index</param>
/// <returns>The first hithelper instance in the hit list having this document index, otherwise null</returns>
private HitHelper DocumentHit(int index)
{
    HitHelper found = null;

    for(int i=0; (found == null) && (i < _hitsHelper.Count); i++)
    {
        HitHelper candidate = (HitHelper)_hitsHelper[i];
        if( candidate.DocumentIndex == index )
            found = candidate;
    }

    return found;
}
/// <summary>
/// Creates the (empty) DataTable "FT_Search_Hits" for storing the hits. The table has the
/// columns Rating (double), Title, Locale, Location and URL (all string).
/// </summary>
private void CreateHitsTable()
{
    _hits = new DataTable("FT_Search_Hits");
    DataColumnCollection columns = _hits.Columns;

    // columns are created writable and non-unique by default
    columns.Add("Rating", typeof(double));

    string[] textColumns = new string[] { "Title", "Locale", "Location", "URL" };
    for(int i=0; i<textColumns.Length; i++)
    {
        columns.Add(textColumns[i], typeof(string));
    }
}
/// <summary>
/// Gets the datatable containing the hits of the last search
/// (null as long as no search has created the table)
/// </summary>
public DataTable Hits
{
    get
    {
        return _hits;
    }
}
/// <summary>
/// Implement IDisposable. Releases the resources of the engine and
/// takes the object off the finalization queue.
/// </summary>
public void Dispose()
{
    Dispose(true);
    GC.SuppressFinalize(this);
}
/// <summary>
/// Releases the reference to the binary index data on the first call with
/// <paramref name="disposing"/> set to true and marks the object as disposed.
/// </summary>
/// <param name="disposing">true if called directly or indirectly by user code
/// (managed resources may be released), false if called by the runtime</param>
private void Dispose(bool disposing)
{
    bool releaseManaged = disposing && !this.disposed;
    if( releaseManaged )
    {
        // Dispose managed resources.
        _binaryFileData = null;
    }

    // the flag is set on every call, no matter if something has been released
    disposed = true;
}
}
}