using System;
using System.Collections;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

namespace HtmlHelp.ChmDecoding
{
    /// <summary>
    /// The class <c>HHCParser</c> implements a parser for HHC contents files.
    /// </summary>
    /// <remarks>
    /// The parser keeps its working state (the merge-item list and the last
    /// regular topic item) in static fields. Both are reset at the start of
    /// every <see cref="ParseHHC"/> call.
    /// </remarks>
    internal sealed class HHCParser
    {
        // NOTE(review): the angle-bracket parts of the patterns below (tag names and
        // named groups) were reconstructed from the group names the code looks up
        // ("innerText", "attributeName", "attributeValue", "attributeTD", "DEPTH")
        // and from the accompanying comments. Confirm against version control.

        /// <summary>
        /// Regular expression for the opening sitemap boundary tag.
        /// Matches are replaced by a '(' for nested parsing.
        /// </summary>
        private const string RE_ULOpening = @"\<ul\>";

        /// <summary>
        /// Regular expression for the closing sitemap boundary tag.
        /// Matches are replaced by a ')' for nested parsing.
        /// </summary>
        private const string RE_ULClosing = @"\</ul\>";

        /// <summary>
        /// Matching ul-tags
        /// </summary>
        private const string RE_ULBoundaries = @"\<ul\>(?<innerText>.*)\</ul\>";

        /// <summary>
        /// Matching the nested tree structure (balanced parentheses).
        /// </summary>
        private const string RE_NestedBoundaries = @"\( (?> [^()]+ | \( (?<DEPTH>) | \) (?<-DEPTH>) )* (?(DEPTH)(?!)) \)";

        /// <summary>
        /// Matching object-tags
        /// </summary>
        private const string RE_ObjectBoundaries = @"\<object(?<innerText>.*?)\</object\>";

        /// <summary>
        /// Matching param tags
        /// </summary>
        private const string RE_ParamBoundaries = @"\<param(?<innerText>.*?)\>";

        /// <summary>
        /// Extracting tag attributes
        /// </summary>
        private const string RE_QuoteAttributes = @"( |\t)*(?<attributeName>[\-a-zA-Z0-9]*)( |\t)*=( |\t)*(?<attributeTD>[\""\'])?(?<attributeValue>.*?(?(attributeTD)\k<attributeTD>|([\s>]|.$)))";

        /// <summary>
        /// Replacement text for a literal '(' inside the sitemap text, so that it is
        /// not mistaken for a tree boundary.
        /// </summary>
        // NOTE(review): presumably turned back into '(' by HttpUtility.HtmlDecode when
        // attribute values are read - confirm the exact entity spelling.
        private const string EscapedOpeningParenthesis = "&#040;";

        /// <summary>
        /// Replacement text for a literal ')' inside the sitemap text.
        /// </summary>
        private const string EscapedClosingParenthesis = "&#041;";

        // Private regular expression objects. They are created once per process
        // instead of on every parse call, because building a compiled Regex is
        // expensive and the patterns never change.
        private static readonly Regex ulRE = new Regex(RE_ULBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex NestedRE = new Regex(RE_NestedBoundaries, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex ObjectRE = new Regex(RE_ObjectBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex ParamRE = new Regex(RE_ParamBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex AttributesRE = new Regex(RE_QuoteAttributes, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex ULOpeningRE = new Regex(RE_ULOpening, RegexOptions.IgnoreCase);
        private static readonly Regex ULClosingRE = new Regex(RE_ULClosing, RegexOptions.IgnoreCase);

        /// <summary>
        /// Internal member storing the list of TOCItems which are holding merge links
        /// </summary>
        private static ArrayList _mergeItems = null;

        /// <summary>
        /// Internal member storing the last read regular topic item.
        /// This is used to handle "Merge" entries and add them as child to this instance.
        /// </summary>
        private static TOCItem _lastTopicItem = null;

        /// <summary>
        /// Parses a HHC file and returns an ArrayList with the table of contents (TOC) tree
        /// </summary>
        /// <param name="hhcFile">string content of the hhc file</param>
        /// <param name="chmFile">CHMFile instance</param>
        /// <returns>Returns an ArrayList with the table of contents (TOC) tree.
        /// The list is empty if the text contains no ul-block.</returns>
        public static ArrayList ParseHHC(string hhcFile, CHMFile chmFile)
        {
            _lastTopicItem = null;
            _mergeItems = null; // clear merged item list

            ArrayList tocList = new ArrayList();

            Match ulMatch = ulRE.Match(hhcFile, 0);
            if (!ulMatch.Success)
            {
                return tocList;
            }

            // first object block contains information types and categories
            Match globalObject = FindGlobalObject(hhcFile, ulMatch);
            if (globalObject != null)
            {
                ParseGlobalSettings(globalObject.Groups["innerText"].Value, chmFile);
            }

            // parse toc tree
            string innerText = ulMatch.Groups["innerText"].Value;

            // Literal parentheses must be escaped first, because the ul boundary
            // tags are rewritten to '(' and ')' for the balanced-group expression.
            innerText = innerText.Replace("(", EscapedOpeningParenthesis);
            innerText = innerText.Replace(")", EscapedClosingParenthesis);
            innerText = ULOpeningRE.Replace(innerText, "(");
            innerText = ULClosingRE.Replace(innerText, ")");

            ParseTree(innerText, null, tocList, chmFile);

            return tocList;
        }

        /// <summary>
        /// Checks if the hhc file contains a global object tag.
        /// </summary>
        /// <param name="hhcFile">string content of the hhc file</param>
        /// <param name="chmFile">chm file (not used by this check)</param>
        /// <returns>true if the hhc content contains a global object tag</returns>
        public static bool HasGlobalObjectTag(string hhcFile, CHMFile chmFile)
        {
            Match ulMatch = ulRE.Match(hhcFile, 0);
            if (!ulMatch.Success)
            {
                return false;
            }

            return FindGlobalObject(hhcFile, ulMatch) != null;
        }

        /// <summary>
        /// Gets true if the previously done parsing found merge-links
        /// </summary>
        public static bool HasMergeLinks
        {
            get
            {
                if (_mergeItems == null)
                {
                    return false;
                }

                return _mergeItems.Count > 0;
            }
        }

        /// <summary>
        /// Gets all TOCItem references which are holding merge-links.
        /// The value is null if the last parse registered no merge-link.
        /// </summary>
        public static ArrayList MergeItems
        {
            get { return _mergeItems; }
        }

        /// <summary>
        /// Returns the first object-block of the file if it starts at or before the
        /// first ul-block, otherwise null.
        /// </summary>
        /// <param name="hhcFile">string content of the hhc file</param>
        /// <param name="ulMatch">successful match of the ul-block expression on <paramref name="hhcFile"/></param>
        /// <returns>the match of the global object block, or null if there is none</returns>
        private static Match FindGlobalObject(string hhcFile, Match ulMatch)
        {
            Match objectMatch = ObjectRE.Match(hhcFile, 0);
            if (!objectMatch.Success)
            {
                return null;
            }

            // The leftmost match of the ul expression starts at the first ul-tag of
            // the file, so its index is the position the object block is compared to.
            int nFirstUL = ulMatch.Index;

            if (objectMatch.Groups["innerText"].Index <= nFirstUL)
            {
                return objectMatch;
            }

            return null;
        }

        /// <summary>
        /// Recursively parses a sitemap tree
        /// </summary>
        /// <param name="text">content text</param>
        /// <param name="parent">Parent for all read items</param>
        /// <param name="arrNodes">arraylist which receives the extracted nodes</param>
        /// <param name="chmFile">CHMFile instance</param>
        private static void ParseTree(string text, TOCItem parent, ArrayList arrNodes, CHMFile chmFile)
        {
            int nIndex = 0;

            for (Match m = NestedRE.Match(text, nIndex); m.Success; m = NestedRE.Match(text, nIndex))
            {
                // content of the nested block without the enclosing parentheses
                string innerText = m.Value.Substring(1, m.Length - 2);

                // items in front of the nested block belong to the current level
                string strPreItems = text.Substring(nIndex, m.Index - nIndex);
                ParseItems(strPreItems, parent, arrNodes, chmFile);

                // the nested block holds the children of the last item on this level
                if ((arrNodes.Count > 0) && (innerText.Length > 0))
                {
                    TOCItem p = (TOCItem)arrNodes[arrNodes.Count - 1];
                    ParseTree(innerText, p, p.Children, chmFile);
                }

                nIndex = m.Index + m.Length;
            }

            // Remaining text contains no further nested block: it is either the
            // whole text (no block found at all) or the tail after the last block.
            if ((nIndex == 0) || (nIndex < text.Length - 1))
            {
                ParseItems(text.Substring(nIndex), parent, arrNodes, chmFile);
            }
        }

        /// <summary>
        /// Parses tree nodes from the text
        /// </summary>
        /// <param name="itemstext">text containing the items</param>
        /// <param name="parent">Parent for all read items</param>
        /// <param name="arrNodes">arraylist where the nodes should be added</param>
        /// <param name="chmFile">CHMFile instance</param>
        private static void ParseItems(string itemstext, TOCItem parent, ArrayList arrNodes, CHMFile chmFile)
        {
            for (Match m = ObjectRE.Match(itemstext, 0); m.Success; m = ObjectRE.Match(itemstext, m.Index + m.Length))
            {
                string innerText = m.Groups["innerText"].Value;

                TOCItem tocItem = new TOCItem();
                tocItem.TocMode = DataMode.TextBased;
                tocItem.AssociatedFile = chmFile;
                tocItem.Parent = parent;

                // read parameters
                for (Match mP = ParamRE.Match(innerText, 0); mP.Success; mP = ParamRE.Match(innerText, mP.Index + mP.Length))
                {
                    string paramName;
                    string paramValue;
                    ReadParam(mP.Groups["innerText"].Value, out paramName, out paramValue);

                    tocItem.Params[paramName] = paramValue;
                    ApplyItemParam(tocItem, paramName, paramValue, chmFile);
                }

                // chmFile is treated as optional elsewhere in this class, so do not
                // dereference it unguarded here.
                if (chmFile != null)
                {
                    tocItem.ChmFile = chmFile.ChmFilePath;
                }

                if (tocItem.MergeLink.Length > 0)
                {
                    // merge entries become children of the last regular topic item
                    if (_lastTopicItem != null)
                    {
                        tocItem.Parent = _lastTopicItem;
                        _lastTopicItem.Children.Add(tocItem);
                    }
                    else
                    {
                        arrNodes.Add(tocItem);
                    }
                }
                else
                {
                    _lastTopicItem = tocItem;
                    arrNodes.Add(tocItem);
                }
            }
        }

        /// <summary>
        /// Applies a single name/value parameter of an object-block to a TOC item.
        /// </summary>
        /// <param name="tocItem">item which receives the setting</param>
        /// <param name="paramName">decoded parameter name</param>
        /// <param name="paramValue">decoded parameter value</param>
        /// <param name="chmFile">CHMFile instance (may be null)</param>
        private static void ApplyItemParam(TOCItem tocItem, string paramName, string paramValue, CHMFile chmFile)
        {
            // Invariant lower-casing keeps the case labels reachable under cultures
            // with special casing rules for 'I' (e.g. "ImageNumber" under tr-TR).
            switch (paramName.ToLower(CultureInfo.InvariantCulture))
            {
                case "name":
                    tocItem.Name = paramValue;
                    break;

                case "local":
                    tocItem.Local = paramValue.Replace("../", "").Replace("./", "");
                    break;

                case "imagenumber":
                    tocItem.ImageIndex = Int32.Parse(paramValue, CultureInfo.InvariantCulture) - 1;

                    int nFolderAdd = 0;
                    if ((chmFile != null) && chmFile.ImageTypeFolder)
                    {
                        // get the value which should be added, to display folders instead of books
                        nFolderAdd = HtmlHelpSystem.UseHH2TreePics ? 8 : 4;
                    }

                    if (tocItem.ImageIndex == 1)
                    {
                        tocItem.ImageIndex = 0;
                    }

                    if (HtmlHelpSystem.UseHH2TreePics && (tocItem.ImageIndex == 0))
                    {
                        tocItem.ImageIndex = TOCItem.STD_FOLDER_HH2 + nFolderAdd;
                    }
                    break;

                case "merge": // this item contains topics or a full TOC from a merged CHM
                    tocItem.MergeLink = paramValue;

                    // "register" this item as merge-link
                    if (_mergeItems == null)
                    {
                        _mergeItems = new ArrayList();
                    }

                    _mergeItems.Add(tocItem);
                    break;

                case "type": // information type assignment for item
                    tocItem.InfoTypeStrings.Add(paramValue);
                    break;
            }
        }

        /// <summary>
        /// Reads the "name" and "value" attributes from the inner text of a param tag.
        /// </summary>
        /// <param name="paramText">text between the param tag name and the closing bracket</param>
        /// <param name="paramName">receives the decoded "name" attribute, or an empty string</param>
        /// <param name="paramValue">receives the decoded "value" attribute without trailing
        /// slashes, or an empty string</param>
        private static void ReadParam(string paramText, out string paramName, out string paramValue)
        {
            paramName = "";
            paramValue = "";

            for (Match mA = AttributesRE.Match(paramText, 0); mA.Success; mA = AttributesRE.Match(paramText, mA.Index + mA.Length))
            {
                string attributeName = mA.Groups["attributeName"].Value;
                string attributeValue = mA.Groups["attributeValue"].Value;
                string attributeTD = mA.Groups["attributeTD"].Value;

                // delete the trailing textqualifier
                if ((attributeTD.Length > 0) && (attributeValue.Length > 0))
                {
                    int ltqi = attributeValue.LastIndexOf(attributeTD);
                    if (ltqi >= 0)
                    {
                        attributeValue = attributeValue.Substring(0, ltqi);
                    }
                }

                string loweredName = attributeName.ToLower(CultureInfo.InvariantCulture);

                if (loweredName == "name")
                {
                    paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
                }
                else if (loweredName == "value")
                {
                    paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values

                    // delete trailing /
                    paramValue = paramValue.TrimEnd('/');
                }
            }
        }

        /// <summary>
        /// Parses the very first &lt;OBJECT&gt; tag in the sitemap file and extracts
        /// information types and categories.
        /// </summary>
        /// <param name="sText">text of the object tag</param>
        /// <param name="chmFile">CHMFile instance; nothing is done if it is null</param>
        private static void ParseGlobalSettings(string sText, CHMFile chmFile)
        {
            // every extracted setting is stored on chmFile, so there is nothing to do without it
            if (chmFile == null)
            {
                return;
            }

            // 0... unknown
            // 1... inclusive info type name
            // 2... exclusive info type name
            // 3... hidden info type name
            // 4... category name
            // 5... incl infotype name for category
            // 6... excl infotype name for category
            // 7... hidden infotype name for category
            int prevItem = 0;

            string sName = "";
            string sDescription = "";
            string curCategory = "";

            // read parameters
            for (Match mP = ParamRE.Match(sText, 0); mP.Success; mP = ParamRE.Match(sText, mP.Index + mP.Length))
            {
                string paramName;
                string paramValue;
                ReadParam(mP.Groups["innerText"].Value, out paramName, out paramValue);

                switch (paramName.ToLower(CultureInfo.InvariantCulture))
                {
                    case "savetype": // inclusive information type name
                        prevItem = 1;
                        sName = paramValue;
                        break;

                    case "savetypedesc": // description of information type
                        InformationTypeMode mode = InformationTypeMode.Inclusive;
                        sDescription = paramValue;

                        if (prevItem == 2)
                        {
                            mode = InformationTypeMode.Exclusive;
                        }
                        else if (prevItem == 3)
                        {
                            mode = InformationTypeMode.Hidden;
                        }

                        if (chmFile.GetInformationType(sName) == null)
                        {
                            // check if the HtmlHelpSystem already holds such an information type
                            InformationType sysType = chmFile.SystemInstance.GetInformationType(sName);
                            if (sysType == null)
                            {
                                // info type not found yet
                                chmFile.InformationTypes.Add(new InformationType(sName, sDescription, mode));
                            }
                            else
                            {
                                chmFile.InformationTypes.Add(sysType);
                            }
                        }

                        prevItem = 0;
                        break;

                    case "saveexclusive": // exclusive information type name
                        prevItem = 2;
                        sName = paramValue;
                        break;

                    case "savehidden": // hidden information type name
                        prevItem = 3;
                        sName = paramValue;
                        break;

                    case "category": // category name
                        prevItem = 4;
                        sName = paramValue;
                        curCategory = sName;
                        break;

                    case "categorydesc": // category description
                        sDescription = paramValue;

                        if (chmFile.GetCategory(sName) == null)
                        {
                            // check if the HtmlHelpSystem already holds such a category
                            Category sysCat = chmFile.SystemInstance.GetCategory(sName);
                            if (sysCat == null)
                            {
                                // add category
                                chmFile.Categories.Add(new Category(sName, sDescription));
                            }
                            else
                            {
                                chmFile.Categories.Add(sysCat);
                            }
                        }

                        prevItem = 0;
                        break;

                    case "type": // inclusive information type which is member of the previously read category
                        prevItem = 5;
                        sName = paramValue;
                        break;

                    case "typedesc": // description of type for category
                        sDescription = paramValue;

                        Category cat = chmFile.GetCategory(curCategory);
                        if (cat != null)
                        {
                            // category found
                            InformationType infoType = chmFile.GetInformationType(sName);
                            if ((infoType != null) && !cat.ContainsInformationType(infoType))
                            {
                                infoType.SetCategoryFlag(true);
                                cat.AddInformationType(infoType);
                            }
                        }

                        prevItem = 0;
                        break;

                    case "typeexclusive": // exclusive information type which is member of the previously read category
                        prevItem = 6;
                        sName = paramValue;
                        break;

                    case "typehidden": // hidden information type which is member of the previously read category
                        prevItem = 7;
                        sName = paramValue;
                        break;

                    default:
                        prevItem = 0;
                        sName = "";
                        sDescription = "";
                        break;
                }
            }
        }
    }
}