using System;
using System.Data;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Collections;
using System.Globalization;

namespace HtmlHelp.ChmDecoding
{
    /// <summary>
    /// The class <c>FullTextEngine</c> implements a full-text searcher for a single CHM file
    /// (decodes the internal $FIftiMain full-text index).
    /// </summary>
    // NOTE(review): this file was recovered from an HTML-mangled copy (all text between
    // '<' and '>' had been stripped). Stripped code spans were reconstructed from the
    // surviving context -- verify against the original HtmlHelp library sources.
    internal sealed class FullTextEngine : IDisposable
    {
        #region Internal helper classes

        /// <summary>
        /// Internal class for decoding the header of the $FIftiMain file.
        /// </summary>
        private sealed class FTHeader
        {
            private int _numberOfIndexFiles = 0;    // number of indexed files
            private int _rootOffset = 0;            // file offset of the root node
            private int _pageCount = 0;             // index-page count
            private int _depth = 0;                 // depth of the index tree
            private byte _scaleDocIdx = 0;          // scale param for document-index s/r en-/decoding
            private byte _scaleCodeCnt = 0;         // scale param for code-count s/r en-/decoding
            private byte _scaleLocCodes = 0;        // scale param for location-codes s/r en-/decoding
            private byte _rootDocIdx = 0;           // root param for document-index s/r en-/decoding
            private byte _rootCodeCnt = 0;          // root param for code-count s/r en-/decoding
            private byte _rootLocCodes = 0;         // root param for location-codes s/r en-/decoding
            private int _nodeSize = 0;              // size of the index/leaf nodes in bytes
            private int _lengthOfLongestWord = 0;   // length of the longest word in the index
            private int _totalNumberOfWords = 0;    // total number of words (including duplicates)
            private int _numberOfUniqueWords = 0;   // total number of unique words
            private int _codePage = 1252;           // codepage identifier
            private int _lcid = 1033;               // language code id
            private Encoding _textEncoder = Encoding.Default; // encoder used for strings in the index

            /// <summary>
            /// Constructor of the header.
            /// </summary>
            /// <param name="binaryData">binary data from which the header will be extracted</param>
            public FTHeader(byte[] binaryData)
            {
                DecodeHeader(binaryData);
            }

            /// <summary>
            /// Internal constructor for reading from a data dump.
            /// </summary>
            internal FTHeader()
            {
            }

            /// <summary>
            /// Decodes the binary header information and fills the members.
            /// </summary>
            /// <param name="binaryData">binary data from which the header will be extracted</param>
            /// <exception cref="InvalidOperationException">
            /// thrown if the scale parameters are not 2 (the only s/r scale this decoder supports)
            /// </exception>
            private void DecodeHeader(byte[] binaryData)
            {
                MemoryStream memStream = new MemoryStream(binaryData);
                BinaryReader binReader = new BinaryReader(memStream);

                binReader.ReadBytes(4);                         // 4 unknown bytes
                _numberOfIndexFiles = binReader.ReadInt32();    // number of indexed files
                binReader.ReadInt32();                          // unknown
                binReader.ReadInt32();                          // unknown
                _pageCount = binReader.ReadInt32();             // page-count
                _rootOffset = binReader.ReadInt32();            // file offset of the root node
                _depth = binReader.ReadInt16();                 // depth of the tree
                binReader.ReadInt32();                          // unknown

                _scaleDocIdx = binReader.ReadByte();
                _rootDocIdx = binReader.ReadByte();
                _scaleCodeCnt = binReader.ReadByte();
                _rootCodeCnt = binReader.ReadByte();
                _scaleLocCodes = binReader.ReadByte();
                _rootLocCodes = binReader.ReadByte();

                if( (_scaleDocIdx != 2) || ( _scaleCodeCnt != 2 ) || ( _scaleLocCodes != 2 ) )
                {
                    Debug.WriteLine("Unsupported scale for s/r encoding !");
                    throw new InvalidOperationException("Unsupported scale for s/r encoding !");
                }

                binReader.ReadBytes(10);            // unknown
                _nodeSize = binReader.ReadInt32();
                binReader.ReadInt32();              // unknown
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                _lengthOfLongestWord = binReader.ReadInt32();
                _totalNumberOfWords = binReader.ReadInt32();
                _numberOfUniqueWords = binReader.ReadInt32();
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                binReader.ReadInt32();              // not important
                binReader.ReadBytes(24);            // not important
                _codePage = binReader.ReadInt32();  // codepage identifier
                _lcid = binReader.ReadInt32();      // language code id

                // derive the ANSI text encoding from the language id of the index
                CultureInfo ci = new CultureInfo(_lcid);
                _textEncoder = Encoding.GetEncoding( ci.TextInfo.ANSICodePage );

                // rest of header is not important for us
            }

            /// <summary>
            /// Dumps the class data to a binary writer.
            /// </summary>
            /// <param name="writer">writer to write the data</param>
            internal void Dump(ref BinaryWriter writer)
            {
                writer.Write( _numberOfIndexFiles );
                writer.Write( _rootOffset );
                writer.Write( _pageCount );
                writer.Write( _depth );
                writer.Write( _scaleDocIdx );
                writer.Write( _rootDocIdx );
                writer.Write( _scaleCodeCnt );
                writer.Write( _rootCodeCnt );
                writer.Write( _scaleLocCodes );
                writer.Write( _rootLocCodes );
                writer.Write( _nodeSize );
                writer.Write( _lengthOfLongestWord );
                writer.Write( _totalNumberOfWords );
                writer.Write( _numberOfUniqueWords );
            }

            /// <summary>
            /// Reads the object data from a dump store.
            /// </summary>
            /// <param name="reader">reader to read the data</param>
            internal void ReadDump(ref BinaryReader reader)
            {
                _numberOfIndexFiles = reader.ReadInt32();
                _rootOffset = reader.ReadInt32();
                _pageCount = reader.ReadInt32();
                _depth = reader.ReadInt32();    // written as Int32 by Dump (int field)
                _scaleDocIdx = reader.ReadByte();
                _rootDocIdx = reader.ReadByte();
                _scaleCodeCnt = reader.ReadByte();
                _rootCodeCnt = reader.ReadByte();
                _scaleLocCodes = reader.ReadByte();
                _rootLocCodes = reader.ReadByte();
                _nodeSize = reader.ReadInt32();
                _lengthOfLongestWord = reader.ReadInt32();
                _totalNumberOfWords = reader.ReadInt32();
                _numberOfUniqueWords = reader.ReadInt32();
            }

            /// <summary>Gets the number of indexed files.</summary>
            public int IndexedFileCount
            {
                get { return _numberOfIndexFiles; }
            }

            /// <summary>Gets the file offset of the root node.</summary>
            public int RootOffset
            {
                get { return _rootOffset; }
            }

            /// <summary>Gets the page count.</summary>
            public int PageCount
            {
                get { return _pageCount; }
            }

            /// <summary>Gets the index depth.</summary>
            public int Depth
            {
                get { return _depth; }
            }

            /// <summary>
            /// Gets the scale param for document index en-/decoding.
            /// </summary>
            /// <remarks>
            /// The scale-and-root method of integer encoding needs two parameters,
            /// s (scale) and r (root size). The integer is encoded as two parts,
            /// p (prefix) and q (actual bits); p determines how many bits are stored,
            /// as well as implicitly determining the high-order bit of the integer.
            /// </remarks>
            public byte ScaleDocumentIndex
            {
                get { return _scaleDocIdx; }
            }

            /// <summary>
            /// Gets the root param for document index en-/decoding (see <see cref="ScaleDocumentIndex"/>).
            /// </summary>
            public byte RootDocumentIndex
            {
                get { return _rootDocIdx; }
            }

            /// <summary>
            /// Gets the scale param for code-count en-/decoding (see <see cref="ScaleDocumentIndex"/>).
            /// </summary>
            public byte ScaleCodeCount
            {
                get { return _scaleCodeCnt; }
            }

            /// <summary>
            /// Gets the root param for code-count en-/decoding (see <see cref="ScaleDocumentIndex"/>).
            /// </summary>
            public byte RootCodeCount
            {
                get { return _rootCodeCnt; }
            }

            /// <summary>
            /// Gets the scale param for location-codes en-/decoding (see <see cref="ScaleDocumentIndex"/>).
            /// </summary>
            public byte ScaleLocationCodes
            {
                get { return _scaleLocCodes; }
            }

            /// <summary>
            /// Gets the root param for location-codes en-/decoding (see <see cref="ScaleDocumentIndex"/>).
            /// </summary>
            public byte RootLocationCodes
            {
                get { return _rootLocCodes; }
            }

            /// <summary>Gets the size in bytes of each index/leaf node.</summary>
            public int NodeSize
            {
                get { return _nodeSize; }
            }

            /// <summary>Gets the length of the longest word in the index.</summary>
            private int LengthOfLongestWord
            {
                get { return _lengthOfLongestWord; }
            }

            /// <summary>Gets the total number of words indexed (including duplicates).</summary>
            public int TotalWordCount
            {
                get { return _totalNumberOfWords; }
            }

            /// <summary>Gets the total number of unique words indexed (excluding duplicates).</summary>
            public int UniqueWordCount
            {
                get { return _numberOfUniqueWords; }
            }

            /// <summary>Gets the codepage identifier.</summary>
            public int CodePage
            {
                get { return _codePage; }
            }

            /// <summary>Gets the language code id.</summary>
            public int LCID
            {
                get { return _lcid; }
            }

            /// <summary>Gets the text encoder used for strings in the index.</summary>
            public Encoding TextEncoder
            {
                get { return _textEncoder; }
            }
        }

        /// <summary>
        /// Internal class for easier hit recording and rate calculation.
        /// </summary>
        private sealed class HitHelper : IComparable
        {
            private int _documentIndex = 0;     // associated document index
            private string _title = "";         // document title
            private string _locale = "";        // document locale
            private string _location = "";      // document location
            private string _url = "";           // document url
            private double _rating = 0;         // accumulated rating
            // per-word partial ratings, keyed by the lower-cased word
            private Hashtable _partialRating = new Hashtable();

            /// <summary>
            /// Constructor of the class.
            /// </summary>
            /// <param name="documentIndex">document index</param>
            /// <param name="title">title</param>
            /// <param name="locale">locale parameter</param>
            /// <param name="location">location</param>
            /// <param name="url">url of document</param>
            /// <param name="rating">initial rating</param>
            public HitHelper(int documentIndex, string title, string locale, string location, string url, double rating)
            {
                _documentIndex = documentIndex;
                _title = title;
                _locale = locale;
                _location = location;
                _url = url;
                _rating = rating;
            }

            /// <summary>
            /// Updates the rating for a found word.
            /// </summary>
            /// <param name="word">word found</param>
            public void UpdateRating(string word)
            {
                // normalize the key so CheckHit() (which looks up lower-cased phrase
                // words) finds it even when the user typed mixed case
                string key = word.ToLower();

                if( _partialRating[key] == null )
                {
                    _partialRating[key] = 100.0;    // first occurrence of this word
                }
                else
                {
                    // every further occurrence increases the word's partial rating by 1%
                    _partialRating[key] = ((double)_partialRating[key]) * 1.01;
                }

                // total rating is the sum of all partial ratings
                _rating = 0.0;
                foreach(double val in _partialRating.Values)
                {
                    _rating += val;
                }
            }

            /// <summary>
            /// Implements the CompareTo method of the IComparable interface.
            /// Allows an easy sort by the document rating.
            /// </summary>
            /// <param name="obj">object to compare</param>
            /// <returns>0 ... equal, -1 ... this instance is less than obj, 1 ... this instance is greater than obj</returns>
            public int CompareTo(object obj)
            {
                if( obj is HitHelper )
                {
                    HitHelper hObj = (HitHelper)obj;
                    return this.Rating.CompareTo( hObj.Rating );
                }
                return -1;  // non-HitHelper objects always sort as "greater"
            }

            /// <summary>Gets the internal hashtable used for counting word hits of the document.</summary>
            internal Hashtable PartialRating
            {
                get { return _partialRating; }
            }

            /// <summary>Gets the document index of the hit helper instance.</summary>
            public int DocumentIndex
            {
                get { return _documentIndex; }
            }

            /// <summary>Gets the title.</summary>
            public string Title
            {
                get { return _title; }
            }

            /// <summary>Gets the locale.</summary>
            public string Locale
            {
                get { return _locale; }
            }

            /// <summary>Gets the location.</summary>
            public string Location
            {
                get { return _location; }
            }

            /// <summary>Gets the url.</summary>
            public string URL
            {
                get { return _url; }
            }

            /// <summary>Gets the rating.</summary>
            public double Rating
            {
                get { return _rating; }
            }
        }

        #endregion

        /// <summary>
        /// Regular expression getting the text between two quotes.
        /// (The named group was lost in the mangled copy; "innerText" is required
        /// by FinalizeQuoted's m.Groups["innerText"] lookup.)
        /// </summary>
        private string RE_Quotes = @"\""(?<innerText>.*?)\""";

        // flag specifying if the object has been disposed
        private bool disposed = false;
        // binary data of the $FIftiMain file
        private byte[] _binaryFileData = null;
        // datatable storing the search hits
        private DataTable _hits = null;
        // arraylist for hit management (HitHelper instances)
        private ArrayList _hitsHelper = new ArrayList();
        // decoded header of the $FIftiMain file
        private FTHeader _header = null;
        // associated chm file object
        private CHMFile _associatedFile = null;

        /// <summary>
        /// Constructor of the class.
        /// </summary>
        /// <param name="binaryFileData">binary file data of the $FIftiMain file</param>
        /// <param name="associatedFile">associated chm file</param>
        public FullTextEngine(byte[] binaryFileData, CHMFile associatedFile)
        {
            _binaryFileData = binaryFileData;
            _associatedFile = associatedFile;

            if(_associatedFile.SystemFile.FullTextSearch)
            {
                _header = new FTHeader(_binaryFileData);    // reading header
            }
        }

        /// <summary>
        /// Standard constructor (used when reading from a dump).
        /// </summary>
        internal FullTextEngine()
        {
        }

        #region Data dumping

        /// <summary>
        /// Dumps the class data to a binary writer.
        /// </summary>
        /// <param name="writer">writer to write the data</param>
        internal void Dump(ref BinaryWriter writer)
        {
            _header.Dump(ref writer);
            writer.Write( _binaryFileData.Length );
            writer.Write(_binaryFileData);
        }

        /// <summary>
        /// Reads the object data from a dump store.
        /// </summary>
        /// <param name="reader">reader to read the data</param>
        internal void ReadDump(ref BinaryReader reader)
        {
            _header = new FTHeader();
            _header.ReadDump(ref reader);

            int nCnt = reader.ReadInt32();
            _binaryFileData = reader.ReadBytes(nCnt);
        }

        /// <summary>
        /// Sets the associated CHMFile instance.
        /// </summary>
        /// <param name="associatedFile">instance to set</param>
        internal void SetCHMFile(CHMFile associatedFile)
        {
            _associatedFile = associatedFile;
        }

        #endregion

        /// <summary>
        /// Gets a flag if full-text searching is available for this chm file.
        /// </summary>
        public bool CanSearch
        {
            get { return (_associatedFile.SystemFile.FullTextSearch && (_header != null) ); }
        }

        /// <summary>
        /// Performs a fulltext search of a single file (no hit limit).
        /// </summary>
        /// <param name="search">word(s) or phrase to search</param>
        /// <param name="partialMatches">true if partial words should be matched also
        /// (if true, a search for 'support' will match 'supports', otherwise not)</param>
        /// <param name="titleOnly">true to only search in titles</param>
        /// <returns>true if the search succeeded. Hits are available through the Hits property.</returns>
        public bool Search(string search, bool partialMatches, bool titleOnly)
        {
            return Search(search, -1, partialMatches, titleOnly);
        }

        /// <summary>
        /// Performs a fulltext search of a single file.
        /// </summary>
        /// <param name="search">word(s) or phrase to search</param>
        /// <param name="MaxHits">max hits; if this number is reached the search is interrupted
        /// (values &lt;= 0 mean unlimited)</param>
        /// <param name="partialMatches">true if partial words should be matched also</param>
        /// <param name="titleOnly">true to only search in titles</param>
        /// <returns>true if the search succeeded. Hits are available through the Hits property.</returns>
        public bool Search(string search, int MaxHits, bool partialMatches, bool titleOnly)
        {
            if(CanSearch)
            {
                string searchString = search;

                // check if this is a quoted phrase search
                bool IsQuoted = (search.IndexOf("\"") > -1);
                if(IsQuoted)
                    searchString = search.Replace("\"", "");    // remove the quotes during search

                bool bRet = true;

                _hitsHelper = null;
                _hitsHelper = new ArrayList();
                _hits = null;
                CreateHitsTable();

                string[] words = searchString.Split(new char[] {' '});

                // NOTE(review): loop body reconstructed from mangled source
                for(int i=0; i < words.Length; i++)
                {
                    bRet &= SearchSingleWord(words[i], MaxHits, partialMatches, titleOnly);

                    // stop early once the hit limit is reached; MaxHits <= 0 means
                    // unlimited (see the "MaxHits < 0" handling below)
                    if( (MaxHits > 0) && (_hitsHelper.Count >= MaxHits) )
                        break;
                }

                if(bRet && IsQuoted)
                {
                    FinalizeQuoted(search);
                }

                if(bRet)
                {
                    _hitsHelper.Sort();     // ascending by rating

                    int nhCount = MaxHits;
                    if( MaxHits < 0 )
                    {
                        nhCount = _hitsHelper.Count;
                    }
                    if( nhCount > _hitsHelper.Count )
                        nhCount = _hitsHelper.Count;

                    // create hits datatable, best rated documents first
                    for(int i=nhCount; i > 0; i--)
                    {
                        HitHelper curHlp = (HitHelper)(_hitsHelper[i-1]);

                        DataRow newRow = _hits.NewRow();
                        newRow["Rating"] = curHlp.Rating;
                        newRow["Title"] = curHlp.Title;
                        newRow["Locale"] = curHlp.Locale;
                        newRow["Location"] = curHlp.Location;
                        newRow["URL"] = curHlp.URL;
                        _hits.Rows.Add( newRow );
                    }
                }
                return bRet;
            }
            return false;
        }

        /// <summary>
        /// Gets rid of all search hits which don't match the quoted phrase.
        /// </summary>
        /// <param name="search">full search string entered by the user</param>
        /// <remarks>Phrase search is not possible using the internal full-text index.
        /// We're just filtering all documents which don't contain all words of the phrase.</remarks>
        private void FinalizeQuoted(string search)
        {
            Regex quoteRE = new Regex(RE_Quotes, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
            int nIndex = 0;

            // process each quoted phrase of the search string
            while( quoteRE.IsMatch(search, nIndex) )
            {
                Match m = quoteRE.Match(search, nIndex);
                string phrase = m.Groups["innerText"].Value;
                string[] wordsInPhrase = phrase.Split( new char[] {' '} );

                for(int i=0; i < _hitsHelper.Count; i++)
                {
                    if( ! CheckHit( ((HitHelper)(_hitsHelper[i])), wordsInPhrase) )
                        _hitsHelper.RemoveAt(i--);  // decrement to compensate for the removal
                }
                nIndex = m.Index + m.Length;
            }
        }

        /// <summary>
        /// Eliminates all search hits where not all of the words have been found.
        /// </summary>
        /// <param name="hit">hithelper instance to check</param>
        /// <param name="wordsInPhrase">word list of the phrase</param>
        /// <returns>true if every word of the phrase was found in the document</returns>
        private bool CheckHit(HitHelper hit, string[] wordsInPhrase)
        {
            // NOTE(review): body reconstructed from mangled source
            for(int i=0; i < wordsInPhrase.Length; i++)
            {
                // a missing key means this word of the phrase never hit the document
                if( hit.PartialRating[ wordsInPhrase[i].ToLower() ] == null )
                    return false;
            }
            return true;
        }

        /// <summary>
        /// Performs a search for a single word in the index.
        /// </summary>
        /// <param name="word">word to search</param>
        /// <param name="MaxHits">maximal hits to return (&lt;= 0 means unlimited)</param>
        /// <param name="partialMatches">true if partial words should be matched also</param>
        /// <param name="titleOnly">true to only search in titles</param>
        /// <returns>Returns true if succeeded</returns>
        private bool SearchSingleWord(string word, int MaxHits, bool partialMatches, bool titleOnly)
        {
            string wordLower = word.ToLower();

            MemoryStream memStream = new MemoryStream(_binaryFileData);
            BinaryReader binReader = new BinaryReader(memStream);

            // seek to root node
            binReader.BaseStream.Seek( _header.RootOffset, SeekOrigin.Begin );

            if( _header.Depth > 2 )
            {
                // unsupported index depth
                Debug.WriteLine("FullTextSearcher.SearchSingleWord() - Failed with message: Unsupported index depth !");
                Debug.WriteLine("File: " + _associatedFile.ChmFilePath);
                Debug.WriteLine(" ");
                return false;
            }

            if( _header.Depth > 1 )
            {
                // seek to the right leaf node (if depth == 1, we are already at the leaf node)
                int freeSpace = binReader.ReadInt16();

                for(int i=0; i < _header.PageCount; ++i)
                {
                    // extract index entries
                    int nWLength = (int)binReader.ReadByte();
                    int nCPosition = (int)binReader.ReadByte();
                    string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength-1, 0, true, _header.TextEncoder);
                    int nLeafOffset = binReader.ReadInt32();
                    binReader.ReadInt16();  // unknown

                    if( sName.CompareTo(wordLower) >= 0 )
                    {
                        // store current position
                        long curPos = binReader.BaseStream.Position;

                        // seek to leaf offset and read leaf node
                        binReader.BaseStream.Seek( nLeafOffset, SeekOrigin.Begin );
                        ReadLeafNode(ref binReader, word, MaxHits, partialMatches, titleOnly);

                        // return to current position and continue reading index nodes
                        binReader.BaseStream.Seek( curPos, SeekOrigin.Begin );
                    }
                }
            }
            return true;
        }

        /// <summary>
        /// Reads a leaf node and extracts documents which hold the searched word.
        /// </summary>
        /// <param name="binReader">reference to the reader</param>
        /// <param name="word">word to search</param>
        /// <param name="MaxHits">maximal hits to return (&lt;= 0 means unlimited)</param>
        /// <param name="partialMatches">true if partial words should be matched also</param>
        /// <param name="titleOnly">true to only search in titles</param>
        private void ReadLeafNode(ref BinaryReader binReader, string word, int MaxHits, bool partialMatches, bool titleOnly)
        {
            int nNextPageOffset = binReader.ReadInt32();    // offset of the following leaf page (unused here)
            binReader.ReadInt16();                          // unknown
            int lfreeSpace = binReader.ReadInt16();         // free space in this node

            string curFullWord = "";
            bool bFound = false;
            string wordLower = word.ToLower();

            for(;;)
            {
                if(binReader.BaseStream.Position >= binReader.BaseStream.Length)
                    break;

                int nWLength = (int)binReader.ReadByte();
                if(nWLength == 0)
                    break;  // end of entries in this node

                int nCPosition = (int)binReader.ReadByte();
                string sName = BinaryReaderHelp.ExtractString(ref binReader, nWLength-1, 0, true, _header.TextEncoder);
                int Context = (int)binReader.ReadByte();    // 0...body tag, 1...title tag, others unknown
                long nrOfWCL = BinaryReaderHelp.ReadENCINT(ref binReader);
                int wclOffset = binReader.ReadInt32();
                binReader.ReadInt16();                      // unknown
                long bytesOfWCL = BinaryReaderHelp.ReadENCINT(ref binReader);

                // words are stored front-coded: nCPosition tells how many characters
                // of the previous word are reused
                if( nCPosition > 0 )
                {
                    curFullWord = CombineStrings(curFullWord, sName, nCPosition);
                }
                else
                {
                    curFullWord = sName;
                }

                bFound = false;
                if(partialMatches)
                    bFound = ( curFullWord.IndexOf(wordLower) >= 0 );
                else
                    bFound = (curFullWord == wordLower);

                if( bFound )
                {
                    if( (titleOnly && (Context == 1)) || (!titleOnly) )
                    {
                        // store actual offset
                        long curPos = binReader.BaseStream.Position;

                        // found the word, begin with WCL decoding
                        binReader.BaseStream.Seek(wclOffset, SeekOrigin.Begin );
                        byte[] wclBytes = binReader.ReadBytes((int)bytesOfWCL);
                        DecodeWCL(wclBytes, MaxHits, word);

                        // back and continue reading leaf nodes
                        binReader.BaseStream.Seek(curPos, SeekOrigin.Begin );
                    }
                }
            }
        }

        /// <summary>
        /// Decodes the s/r encoded WordCodeList (=wcl) and creates hit entries.
        /// </summary>
        /// <param name="wclBytes">wcl encoded byte array</param>
        /// <param name="MaxHits">maximal hits (&lt;= 0 means unlimited)</param>
        /// <param name="word">the word to find</param>
        private void DecodeWCL(byte[] wclBytes, int MaxHits, string word)
        {
            // expand the byte array into single bits (MSB first)
            // NOTE(review): bit-expansion loop reconstructed from mangled source
            byte[] wclBits = new byte[ wclBytes.Length * 8 ];

            int nBitIdx = 0;
            for(int i=0; i < wclBytes.Length; i++)
            {
                for(int j=0; j < 8; j++)
                {
                    wclBits[nBitIdx] = ( (wclBytes[i] & (byte)(1 << (7-j))) > (byte)0 ) ? (byte)1 : (byte)0;
                    nBitIdx++;
                }
            }

            nBitIdx = 0;
            int nDocIdx = 0;    // delta encoded

            while(nBitIdx < wclBits.Length)
            {
                nDocIdx += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleDocumentIndex, _header.RootDocumentIndex, ref nBitIdx);
                int nCodeCnt = BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleCodeCount, _header.RootCodeCount, ref nBitIdx);

                int nWordLocation = 0;  // delta encoded
                for(int locidx=0; locidx < nCodeCnt; locidx++)
                {
                    nWordLocation += BinaryReaderHelp.ReadSRItem(wclBits, _header.ScaleLocationCodes, _header.RootLocationCodes, ref nBitIdx);
                }

                HitHelper hitObj = DocumentHit(nDocIdx);

                if( hitObj == null )    // document not hit yet
                {
                    // honour the hit limit; MaxHits <= 0 means unlimited
                    if( (MaxHits > 0) && (_hitsHelper.Count > MaxHits) )
                        return;

                    hitObj = new HitHelper(nDocIdx,
                        ((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).Title,
                        ((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).Locale,
                        _associatedFile.CompileFile,
                        ((TopicEntry)(_associatedFile.TopicsFile.TopicTable[nDocIdx])).URL,
                        0.0);
                    _hitsHelper.Add(hitObj);
                }

                // every location code counts as one occurrence of the word
                for(int k=0; k < nCodeCnt; k++)
                    hitObj.UpdateRating(word);
            }
        }

        /// <summary>
        /// Combines a "master" word with a partial word (front-coded string expansion).
        /// </summary>
        /// <param name="word">the master word</param>
        /// <param name="partial">the partial word</param>
        /// <param name="partialPosition">position to place the partial word</param>
        /// <returns>returns a combined string</returns>
        private string CombineStrings(string word, string partial, int partialPosition)
        {
            // NOTE(review): loop header and first condition reconstructed from mangled source
            string sCombined = word;

            int i = 0;
            for(i=0; i < partial.Length; i++)
            {
                if( (partialPosition + i) > (sCombined.Length - 1) )
                {
                    // past the end of the master word - append
                    sCombined += partial[i];
                }
                else
                {
                    // inside the master word - overwrite the character in place
                    StringBuilder sb = new StringBuilder(sCombined);
                    sb.Replace( sCombined[partialPosition+i], partial[i], partialPosition+i, 1);
                    sCombined = sb.ToString();
                }
            }

            // truncate any leftover tail of the master word
            if( ! ((i + partialPosition) > (sCombined.Length - 1)) )
            {
                sCombined = sCombined.Substring(0, partialPosition + partial.Length);
            }
            return sCombined;
        }

        /// <summary>
        /// Gets the HitHelper instance for a specific document index.
        /// </summary>
        /// <param name="index">document index</param>
        /// <returns>The reference of the hithelper instance for this document index, otherwise null</returns>
        private HitHelper DocumentHit(int index)
        {
            foreach(HitHelper curObj in _hitsHelper)
            {
                if( curObj.DocumentIndex == index )
                    return curObj;
            }
            return null;
        }

        /// <summary>
        /// Creates a DataTable for storing the hits.
        /// </summary>
        private void CreateHitsTable()
        {
            _hits = new DataTable("FT_Search_Hits");
            AddHitsColumn("Rating", typeof(System.Double));
            AddHitsColumn("Title", typeof(System.String));
            AddHitsColumn("Locale", typeof(System.String));
            AddHitsColumn("Location", typeof(System.String));
            AddHitsColumn("URL", typeof(System.String));
        }

        /// <summary>
        /// Adds a single writable, non-unique column to the hits table.
        /// </summary>
        /// <param name="name">column name</param>
        /// <param name="type">column data type</param>
        private void AddHitsColumn(string name, Type type)
        {
            DataColumn ftColumn = new DataColumn();
            ftColumn.DataType = type;
            ftColumn.ColumnName = name;
            ftColumn.ReadOnly = false;
            ftColumn.Unique = false;
            _hits.Columns.Add(ftColumn);
        }

        /// <summary>
        /// Gets a datatable containing the hits of the last search.
        /// </summary>
        public DataTable Hits
        {
            get { return _hits; }
        }

        /// <summary>
        /// Implement IDisposable.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            // This object will be cleaned up by the Dispose method. Therefore
            // GC.SuppressFinalize takes this object off the finalization queue and
            // prevents finalization code for this object from executing a second time.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Dispose(bool disposing) executes in two distinct scenarios.
        /// If disposing equals true, the method has been called directly or indirectly
        /// by a user's code; managed and unmanaged resources can be disposed.
        /// If disposing equals false, the method has been called by the runtime from
        /// inside the finalizer and you should not reference other objects;
        /// only unmanaged resources can be disposed.
        /// </summary>
        /// <param name="disposing">disposing flag</param>
        private void Dispose(bool disposing)
        {
            // check to see if Dispose has already been called
            if(!this.disposed)
            {
                if(disposing)
                {
                    // dispose managed resources
                    _binaryFileData = null;
                }
            }
            disposed = true;
        }
    }
}