reactos/irc/TechBot/CHMLibrary/CHMDecoding/HHCParser.cs
Casper Hornstrup 9dab4509fa Import TechBot
svn path=/trunk/; revision=13064
2005-01-15 19:27:25 +00:00

594 lines
18 KiB
C#

using System;
using System.Collections;
using System.Text;
using System.Text.RegularExpressions;
namespace HtmlHelp.ChmDecoding
{
/// <summary>
/// The class <c>HHCParser</c> implements a parser for HHC contents files.
/// </summary>
internal sealed class HHCParser
{
/// <summary>
/// regular expressions for replacing the sitemap boundary tags
/// </summary>
private static string RE_ULOpening = @"\<ul\>"; // will be replaced by a '(' for nested parsing
private static string RE_ULClosing = @"\</ul\>"; // will be replaced by a ')' for nested parsing
/// <summary>
/// Matching ul-tags
/// </summary>
private static string RE_ULBoundaries = @"\<ul\>(?<innerText>.*)\</ul\>";
/// <summary>
/// Matching the nested tree structure.
/// </summary>
private static string RE_NestedBoundaries = @"\( (?> [^()]+ | \( (?<DEPTH>) | \) (?<-DEPTH>) )* (?(DEPTH)(?!)) \)";
/// <summary>
/// Matching object-tags
/// </summary>
private static string RE_ObjectBoundaries = @"\<object(?<innerText>.*?)\</object\>";
/// <summary>
/// Matching param tags
/// </summary>
private static string RE_ParamBoundaries = @"\<param(?<innerText>.*?)\>";
/// <summary>
/// Extracting tag attributes
/// </summary>
private const string RE_QuoteAttributes = @"( |\t)*(?<attributeName>[\-a-zA-Z0-9]*)( |\t)*=( |\t)*(?<attributeTD>[\""\'])?(?<attributeValue>.*?(?(attributeTD)\k<attributeTD>|([\s>]|.$)))";
/// <summary>
/// private regular expressionobjects
/// </summary>
private static Regex ulRE;
private static Regex NestedRE;
private static Regex ObjectRE;
private static Regex ParamRE;
private static Regex AttributesRE;
/// <summary>
/// Internal member storing the list of TOCItems which are holding merge links
/// </summary>
private static ArrayList _mergeItems = null;
/// <summary>
/// Internal member storing the last read regular topic item.
/// This is used to handle "Merge" entries and add them as child to this instance.
/// </summary>
private static TOCItem _lastTopicItem = null;
/// <summary>
/// Parses a HHC file and returns an ArrayList with the table of contents (TOC) tree
/// </summary>
/// <param name="hhcFile">string content of the hhc file</param>
/// <param name="chmFile">CHMFile instance</param>
/// <returns>Returns an ArrayList with the table of contents (TOC) tree</returns>
public static ArrayList ParseHHC(string hhcFile, CHMFile chmFile)
{
_lastTopicItem = null;
_mergeItems = null; // clear merged item list
ArrayList tocList = new ArrayList();
ulRE = new Regex(RE_ULBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
NestedRE = new Regex(RE_NestedBoundaries, RegexOptions.IgnorePatternWhitespace | RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
ObjectRE = new Regex(RE_ObjectBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
ParamRE = new Regex(RE_ParamBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
AttributesRE = new Regex(RE_QuoteAttributes, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
int innerTextIdx = ulRE.GroupNumberFromName("innerText");
if( ulRE.IsMatch(hhcFile, 0) )
{
Match m = ulRE.Match(hhcFile, 0);
int nFirstUL = 0;
nFirstUL = hhcFile.ToLower().IndexOf("<ul>");
if(nFirstUL == -1)
nFirstUL = hhcFile.ToLower().IndexOf("<il>");
if( ObjectRE.IsMatch(hhcFile, 0) ) // first object block contains information types and categories
{
Match mO = ObjectRE.Match(hhcFile, 0);
int iOTxt = ObjectRE.GroupNumberFromName("innerText");
string globalText = mO.Groups[iOTxt].Value;
if( mO.Groups[iOTxt].Index <= nFirstUL)
ParseGlobalSettings( globalText, chmFile );
}
// parse toc tree
string innerText = m.Groups["innerText"].Value;
innerText = innerText.Replace("(", "&#040;");
innerText = innerText.Replace(")", "&#041;");
innerText = Regex.Replace(innerText, RE_ULOpening, "(", RegexOptions.IgnoreCase);
innerText = Regex.Replace(innerText, RE_ULClosing, ")", RegexOptions.IgnoreCase);
ParseTree( innerText, null, tocList, chmFile );
}
return tocList;
}
/// <summary>
/// Checks if the hhc file contains a global object tag.
/// </summary>
/// <param name="hhcFile">string content of the hhc file</param>
/// <param name="chmFile">chm file</param>
/// <returns>true if the hhc content contains a global object tag</returns>
public static bool HasGlobalObjectTag(string hhcFile, CHMFile chmFile)
{
bool bRet = false;
ulRE = new Regex(RE_ULBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
ObjectRE = new Regex(RE_ObjectBoundaries, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
int innerTextIdx = ulRE.GroupNumberFromName("innerText");
if( ulRE.IsMatch(hhcFile, 0) )
{
Match m = ulRE.Match(hhcFile, 0);
int nFirstUL = 0;
nFirstUL = hhcFile.ToLower().IndexOf("<ul>");
if(nFirstUL == -1)
nFirstUL = hhcFile.ToLower().IndexOf("<il>");
if( ObjectRE.IsMatch(hhcFile, 0) ) // first object block contains information types and categories
{
Match mO = ObjectRE.Match(hhcFile, 0);
int iOTxt = ObjectRE.GroupNumberFromName("innerText");
string globalText = mO.Groups[iOTxt].Value;
if( mO.Groups[iOTxt].Index <= nFirstUL)
bRet = true;
}
}
return bRet;
}
/// <summary>
/// Gets true if the previously done parsing found merge-links
/// </summary>
public static bool HasMergeLinks
{
get
{
if(_mergeItems==null)
return false;
return _mergeItems.Count > 0;
}
}
/// <summary>
/// Gets all TOCItem references which are holding merge-links
/// </summary>
public static ArrayList MergeItems
{
get { return _mergeItems; }
}
/// <summary>
/// Recursively parses a sitemap tree
/// </summary>
/// <param name="text">content text</param>
/// <param name="parent">Parent for all read items</param>
/// <param name="arrNodes">arraylist which receives the extracted nodes</param>
/// <param name="chmFile">CHMFile instance</param>
private static void ParseTree( string text, TOCItem parent, ArrayList arrNodes, CHMFile chmFile )
{
string strPreItems="", strPostItems="";
string innerText = "";
int nIndex = 0;
while( NestedRE.IsMatch(text, nIndex) )
{
Match m = NestedRE.Match(text, nIndex);
innerText = m.Value.Substring( 1, m.Length-2);
strPreItems = text.Substring(nIndex,m.Index-nIndex);
ParseItems(strPreItems, parent, arrNodes, chmFile);
if((arrNodes.Count>0) && (innerText.Length > 0) )
{
TOCItem p = ((TOCItem)(arrNodes[arrNodes.Count-1]));
ParseTree( innerText, p, p.Children, chmFile );
}
nIndex = m.Index+m.Length;
}
if( nIndex == 0)
{
strPostItems = text.Substring(nIndex, text.Length-nIndex);
ParseItems(strPostItems, parent, arrNodes, chmFile);
}
else if( nIndex < text.Length-1)
{
strPostItems = text.Substring(nIndex, text.Length-nIndex);
ParseTree(strPostItems, parent, arrNodes, chmFile);
}
}
/// <summary>
/// Parses tree nodes from the text
/// </summary>
/// <param name="itemstext">text containing the items</param>
/// <param name="parent">Parent for all read items</param>
/// <param name="arrNodes">arraylist where the nodes should be added</param>
/// <param name="chmFile">CHMFile instance</param>
private static void ParseItems( string itemstext, TOCItem parent, ArrayList arrNodes, CHMFile chmFile)
{
int innerTextIdx = ObjectRE.GroupNumberFromName("innerText");
int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");
// get group-name indexes
int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");
int nObjStartIndex = 0;
while( ObjectRE.IsMatch(itemstext, nObjStartIndex) )
{
Match m = ObjectRE.Match(itemstext, nObjStartIndex);
string innerText = m.Groups[innerTextIdx].Value;
TOCItem tocItem = new TOCItem();
tocItem.TocMode = DataMode.TextBased;
tocItem.AssociatedFile = chmFile;
tocItem.Parent = parent;
// read parameters
int nParamIndex = 0;
while( ParamRE.IsMatch(innerText, nParamIndex) )
{
Match mP = ParamRE.Match(innerText, nParamIndex);
string innerP = mP.Groups[innerPTextIdx].Value;
string paramName = "";
string paramValue = "";
int nAttrIdx = 0;
while( AttributesRE.IsMatch( innerP, nAttrIdx ) )
{
Match mA = AttributesRE.Match(innerP, nAttrIdx);
string attributeName = mA.Groups[nameIndex].Value;
string attributeValue = mA.Groups[valueIndex].Value;
string attributeTD = mA.Groups[tdIndex].Value;
if(attributeTD.Length > 0)
{
// delete the trailing textqualifier
if( attributeValue.Length > 0)
{
int ltqi = attributeValue.LastIndexOf( attributeTD );
if(ltqi >= 0)
{
attributeValue = attributeValue.Substring(0,ltqi);
}
}
}
if( attributeName.ToLower() == "name")
{
paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
}
if( attributeName.ToLower() == "value")
{
paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
// delete trailing /
while((paramValue.Length>0)&&(paramValue[paramValue.Length-1] == '/'))
paramValue = paramValue.Substring(0,paramValue.Length-1);
}
nAttrIdx = mA.Index+mA.Length;
}
tocItem.Params[paramName] = paramValue;
switch(paramName.ToLower())
{
case "name":
{
tocItem.Name = paramValue;
};break;
case "local":
{
tocItem.Local = paramValue.Replace("../", "").Replace("./", "");
};break;
case "imagenumber":
{
tocItem.ImageIndex = Int32.Parse(paramValue);
tocItem.ImageIndex-=1;
int nFolderAdd = 0;
if((chmFile != null) && (chmFile.ImageTypeFolder))
{
// get the value which should be added, to display folders instead of books
if(HtmlHelpSystem.UseHH2TreePics)
nFolderAdd = 8;
else
nFolderAdd = 4;
}
if(tocItem.ImageIndex%2 != 0)
{
if(tocItem.ImageIndex==1)
tocItem.ImageIndex=0;
}
if(HtmlHelpSystem.UseHH2TreePics)
if( tocItem.ImageIndex == 0)
tocItem.ImageIndex = TOCItem.STD_FOLDER_HH2+nFolderAdd;
};break;
case "merge": // this item contains topics or a full TOC from a merged CHM
{
tocItem.MergeLink = paramValue;
// "register" this item as merge-link
if(_mergeItems==null)
_mergeItems=new ArrayList();
_mergeItems.Add(tocItem);
};break;
case "type": // information type assignment for item
{
tocItem.InfoTypeStrings.Add( paramValue );
};break;
}
nParamIndex = mP.Index+mP.Length;
}
tocItem.ChmFile = chmFile.ChmFilePath;
if(tocItem.MergeLink.Length > 0)
{
if(_lastTopicItem != null)
{
tocItem.Parent = _lastTopicItem;
_lastTopicItem.Children.Add(tocItem);
}
else
arrNodes.Add( tocItem );
}
else
{
_lastTopicItem = tocItem;
arrNodes.Add( tocItem );
}
nObjStartIndex = m.Index+m.Length;
}
}
/// <summary>
/// Parses the very first &lt;OBJECT&gt; tag in the sitemap file and extracts
/// information types and categories.
/// </summary>
/// <param name="sText">text of the object tag</param>
/// <param name="chmFile">CHMFile instance</param>
private static void ParseGlobalSettings(string sText, CHMFile chmFile)
{
int innerPTextIdx = ParamRE.GroupNumberFromName("innerText");
// get group-name indexes
int nameIndex = AttributesRE.GroupNumberFromName("attributeName");
int valueIndex = AttributesRE.GroupNumberFromName("attributeValue");
int tdIndex = AttributesRE.GroupNumberFromName("attributeTD");
// read parameters
int nParamIndex = 0;
// 0... unknown
// 1... inclusinve info type name
// 2... exclusive info type name
// 3... hidden info type name
// 4... category name
// 5... incl infotype name for category
// 6... excl infotype name for category
// 7... hidden infotype name for category
int prevItem = 0;
string sName = "";
string sDescription = "";
string curCategory = "";
while( ParamRE.IsMatch(sText, nParamIndex) )
{
Match mP = ParamRE.Match(sText, nParamIndex);
string innerP = mP.Groups[innerPTextIdx].Value;
string paramName = "";
string paramValue = "";
int nAttrIdx = 0;
while( AttributesRE.IsMatch( innerP, nAttrIdx ) )
{
Match mA = AttributesRE.Match(innerP, nAttrIdx);
string attributeName = mA.Groups[nameIndex].Value;
string attributeValue = mA.Groups[valueIndex].Value;
string attributeTD = mA.Groups[tdIndex].Value;
if(attributeTD.Length > 0)
{
// delete the trailing textqualifier
if( attributeValue.Length > 0)
{
int ltqi = attributeValue.LastIndexOf( attributeTD );
if(ltqi >= 0)
{
attributeValue = attributeValue.Substring(0,ltqi);
}
}
}
if( attributeName.ToLower() == "name")
{
paramName = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
}
if( attributeName.ToLower() == "value")
{
paramValue = HttpUtility.HtmlDecode(attributeValue); // for unicode encoded values
// delete trailing /
while((paramValue.Length>0)&&(paramValue[paramValue.Length-1] == '/'))
paramValue = paramValue.Substring(0,paramValue.Length-1);
}
nAttrIdx = mA.Index+mA.Length;
}
switch(paramName.ToLower())
{
case "savetype": // inclusive information type name
{
prevItem = 1;
sName = paramValue;
};break;
case "savetypedesc": // description of information type
{
InformationTypeMode mode = InformationTypeMode.Inclusive;
sDescription = paramValue;
if( prevItem == 1)
mode = InformationTypeMode.Inclusive;
if( prevItem == 2)
mode = InformationTypeMode.Exclusive;
if( prevItem == 3)
mode = InformationTypeMode.Hidden;
if( chmFile.GetInformationType( sName ) == null)
{
// check if the HtmlHelpSystem already holds such an information type
if( chmFile.SystemInstance.GetInformationType( sName ) == null)
{
// info type not found yet
InformationType newType = new InformationType(sName, sDescription, mode);
chmFile.InformationTypes.Add(newType);
}
else
{
InformationType sysType = chmFile.SystemInstance.GetInformationType( sName );
chmFile.InformationTypes.Add( sysType );
}
}
prevItem = 0;
};break;
case "saveexclusive": // exclusive information type name
{
prevItem = 2;
sName = paramValue;
};break;
case "savehidden": // hidden information type name
{
prevItem = 3;
sName = paramValue;
};break;
case "category": // category name
{
prevItem = 4;
sName = paramValue;
curCategory = sName;
};break;
case "categorydesc": // category description
{
sDescription = paramValue;
if( chmFile.GetCategory( sName ) == null)
{
// check if the HtmlHelpSystem already holds such a category
if( chmFile.SystemInstance.GetCategory( sName ) == null)
{
// add category
Category newCat = new Category(sName, sDescription);
chmFile.Categories.Add(newCat);
}
else
{
Category sysCat = chmFile.SystemInstance.GetCategory( sName );
chmFile.Categories.Add( sysCat );
}
}
prevItem = 0;
};break;
case "type": // inclusive information type which is member of the previously read category
{
prevItem = 5;
sName = paramValue;
};break;
case "typedesc": // description of type for category
{
sDescription = paramValue;
Category cat = chmFile.GetCategory( curCategory );
if( cat != null)
{
// category found
InformationType infoType = chmFile.GetInformationType( sName );
if( infoType != null)
{
if( !cat.ContainsInformationType(infoType))
{
infoType.SetCategoryFlag(true);
cat.AddInformationType(infoType);
}
}
}
prevItem = 0;
};break;
case "typeexclusive": // exclusive information type which is member of the previously read category
{
prevItem = 6;
sName = paramValue;
};break;
case "typehidden": // hidden information type which is member of the previously read category
{
prevItem = 7;
sName = paramValue;
};break;
default:
{
prevItem = 0;
sName = "";
sDescription = "";
};break;
}
nParamIndex = mP.Index+mP.Length;
}
}
}
}