/* * IXmlReader implementation * * Copyright 2010, 2012-2013, 2016-2017 Nikolay Sivov * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA */ #define COBJMACROS #include #include #include #include "windef.h" #include "winbase.h" #include "initguid.h" #include "objbase.h" #include "xmllite.h" #include "xmllite_private.h" #ifdef __REACTOS__ #include #endif #include "wine/debug.h" #include "wine/list.h" WINE_DEFAULT_DEBUG_CHANNEL(xmllite); /* not defined in public headers */ DEFINE_GUID(IID_IXmlReaderInput, 0x0b3ccc9b, 0x9214, 0x428b, 0xa2, 0xae, 0xef, 0x3a, 0xa8, 0x71, 0xaf, 0xda); typedef enum { XmlReadInState_Initial, XmlReadInState_XmlDecl, XmlReadInState_Misc_DTD, XmlReadInState_DTD, XmlReadInState_DTD_Misc, XmlReadInState_Element, XmlReadInState_Content, XmlReadInState_MiscEnd, /* optional Misc at the end of a document */ XmlReadInState_Eof } XmlReaderInternalState; /* This state denotes where parsing was interrupted by input problem. Reader resumes parsing using this information. 
*/ typedef enum { XmlReadResumeState_Initial, XmlReadResumeState_PITarget, XmlReadResumeState_PIBody, XmlReadResumeState_CDATA, XmlReadResumeState_Comment, XmlReadResumeState_STag, XmlReadResumeState_CharData, XmlReadResumeState_Whitespace } XmlReaderResumeState; /* saved pointer index to resume from particular input position */ typedef enum { XmlReadResume_Name, /* PITarget, name for NCName, prefix for QName */ XmlReadResume_Local, /* local for QName */ XmlReadResume_Body, /* PI body, comment text, CDATA text, CharData text */ XmlReadResume_Last } XmlReaderResume; typedef enum { StringValue_LocalName, StringValue_Prefix, StringValue_QualifiedName, StringValue_Value, StringValue_Last } XmlReaderStringValue; static const WCHAR usasciiW[] = {'U','S','-','A','S','C','I','I',0}; static const WCHAR utf16W[] = {'U','T','F','-','1','6',0}; static const WCHAR utf8W[] = {'U','T','F','-','8',0}; static const WCHAR dblquoteW[] = {'\"',0}; static const WCHAR quoteW[] = {'\'',0}; static const WCHAR ltW[] = {'<',0}; static const WCHAR gtW[] = {'>',0}; static const WCHAR commentW[] = {'<','!','-','-',0}; static const WCHAR piW[] = {'<','?',0}; BOOL is_namestartchar(WCHAR ch); static const char *debugstr_nodetype(XmlNodeType nodetype) { static const char * const type_names[] = { "None", "Element", "Attribute", "Text", "CDATA", "", "", "ProcessingInstruction", "Comment", "", "DocumentType", "", "", "Whitespace", "", "EndElement", "", "XmlDeclaration" }; if (nodetype > _XmlNodeType_Last) return wine_dbg_sprintf("unknown type=%d", nodetype); return type_names[nodetype]; } static const char *debugstr_reader_prop(XmlReaderProperty prop) { static const char * const prop_names[] = { "MultiLanguage", "ConformanceLevel", "RandomAccess", "XmlResolver", "DtdProcessing", "ReadState", "MaxElementDepth", "MaxEntityExpansion" }; if (prop > _XmlReaderProperty_Last) return wine_dbg_sprintf("unknown property=%d", prop); return prop_names[prop]; } struct xml_encoding_data { const WCHAR *name; 
xml_encoding enc; UINT cp; }; static const struct xml_encoding_data xml_encoding_map[] = { { usasciiW, XmlEncoding_USASCII, 20127 }, { utf16W, XmlEncoding_UTF16, 1200 }, { utf8W, XmlEncoding_UTF8, CP_UTF8 }, }; const WCHAR *get_encoding_name(xml_encoding encoding) { return xml_encoding_map[encoding].name; } xml_encoding get_encoding_from_codepage(UINT codepage) { int i; for (i = 0; i < ARRAY_SIZE(xml_encoding_map); i++) { if (xml_encoding_map[i].cp == codepage) return xml_encoding_map[i].enc; } return XmlEncoding_Unknown; } typedef struct { char *data; UINT cur; unsigned int allocated; unsigned int written; BOOL prev_cr; } encoded_buffer; typedef struct input_buffer input_buffer; typedef struct { IXmlReaderInput IXmlReaderInput_iface; LONG ref; /* reference passed on IXmlReaderInput creation, is kept when input is created */ IUnknown *input; IMalloc *imalloc; xml_encoding encoding; BOOL hint; WCHAR *baseuri; /* stream reference set after SetInput() call from reader, stored as sequential stream, cause currently optimizations possible with IStream aren't implemented */ ISequentialStream *stream; input_buffer *buffer; unsigned int pending : 1; } xmlreaderinput; static const struct IUnknownVtbl xmlreaderinputvtbl; /* Structure to hold parsed string of specific length. Reader stores node value as 'start' pointer, on request a null-terminated version of it is allocated. To init a strval variable use reader_init_strval(), to set strval as a reader value use reader_set_strval(). 
*/ typedef struct { WCHAR *str; /* allocated null-terminated string */ UINT len; /* length in WCHARs, altered after ReadValueChunk */ UINT start; /* input position where value starts */ } strval; static WCHAR emptyW[] = {0}; static WCHAR xmlW[] = {'x','m','l',0}; static WCHAR xmlnsW[] = {'x','m','l','n','s',0}; static const strval strval_empty = { emptyW }; static const strval strval_xml = { xmlW, 3 }; static const strval strval_xmlns = { xmlnsW, 5 }; struct reader_position { UINT line_number; UINT line_position; }; enum attribute_flags { ATTRIBUTE_NS_DEFINITION = 0x1, ATTRIBUTE_DEFAULT_NS_DEFINITION = 0x2, }; struct attribute { struct list entry; strval prefix; strval localname; strval qname; strval value; struct reader_position position; unsigned int flags; }; struct element { struct list entry; strval prefix; strval localname; strval qname; struct reader_position position; }; struct ns { struct list entry; strval prefix; strval uri; struct element *element; }; typedef struct { IXmlReader IXmlReader_iface; LONG ref; xmlreaderinput *input; IMalloc *imalloc; XmlReadState state; HRESULT error; /* error set on XmlReadState_Error */ XmlReaderInternalState instate; XmlReaderResumeState resumestate; XmlNodeType nodetype; DtdProcessing dtdmode; IXmlResolver *resolver; IUnknown *mlang; struct reader_position position; struct list attrs; /* attributes list for current node */ struct attribute *attr; /* current attribute */ UINT attr_count; struct list nsdef; struct list ns; struct list elements; int chunk_read_off; strval strvalues[StringValue_Last]; UINT depth; UINT max_depth; BOOL is_empty_element; struct element empty_element; /* used for empty elements without end tag , and to keep imalloc, len); } static inline void *reader_alloc_zero(xmlreader *reader, size_t len) { void *ret = reader_alloc(reader, len); if (ret) memset(ret, 0, len); return ret; } static inline void reader_free(xmlreader *reader, void *mem) { m_free(reader->imalloc, mem); } /* Just return pointer 
from offset, no attempt to read more. */ static inline WCHAR *reader_get_ptr2(const xmlreader *reader, UINT offset) { encoded_buffer *buffer = &reader->input->buffer->utf16; return (WCHAR*)buffer->data + offset; } static inline WCHAR *reader_get_strptr(const xmlreader *reader, const strval *v) { return v->str ? v->str : reader_get_ptr2(reader, v->start); } static HRESULT reader_strvaldup(xmlreader *reader, const strval *src, strval *dest) { *dest = *src; if (src->str != strval_empty.str) { dest->str = reader_alloc(reader, (dest->len+1)*sizeof(WCHAR)); if (!dest->str) return E_OUTOFMEMORY; memcpy(dest->str, reader_get_strptr(reader, src), dest->len*sizeof(WCHAR)); dest->str[dest->len] = 0; dest->start = 0; } return S_OK; } /* reader input memory allocation functions */ static inline void *readerinput_alloc(xmlreaderinput *input, size_t len) { return m_alloc(input->imalloc, len); } static inline void *readerinput_realloc(xmlreaderinput *input, void *mem, size_t len) { return m_realloc(input->imalloc, mem, len); } static inline void readerinput_free(xmlreaderinput *input, void *mem) { m_free(input->imalloc, mem); } static inline WCHAR *readerinput_strdupW(xmlreaderinput *input, const WCHAR *str) { LPWSTR ret = NULL; if(str) { DWORD size; size = (lstrlenW(str)+1)*sizeof(WCHAR); ret = readerinput_alloc(input, size); if (ret) memcpy(ret, str, size); } return ret; } /* This one frees stored string value if needed */ static void reader_free_strvalued(xmlreader *reader, strval *v) { if (v->str != strval_empty.str) { reader_free(reader, v->str); *v = strval_empty; } } static void reader_clear_attrs(xmlreader *reader) { struct attribute *attr, *attr2; LIST_FOR_EACH_ENTRY_SAFE(attr, attr2, &reader->attrs, struct attribute, entry) { reader_free_strvalued(reader, &attr->localname); reader_free_strvalued(reader, &attr->value); reader_free(reader, attr); } list_init(&reader->attrs); reader->attr_count = 0; reader->attr = NULL; } /* attribute data holds pointers to buffer data, so 
buffer shrink is not possible while we are on a node with attributes */ static HRESULT reader_add_attr(xmlreader *reader, strval *prefix, strval *localname, strval *qname, strval *value, const struct reader_position *position, unsigned int flags) { struct attribute *attr; HRESULT hr; attr = reader_alloc(reader, sizeof(*attr)); if (!attr) return E_OUTOFMEMORY; hr = reader_strvaldup(reader, localname, &attr->localname); if (hr == S_OK) { hr = reader_strvaldup(reader, value, &attr->value); if (hr != S_OK) reader_free_strvalued(reader, &attr->value); } if (hr != S_OK) { reader_free(reader, attr); return hr; } if (prefix) attr->prefix = *prefix; else memset(&attr->prefix, 0, sizeof(attr->prefix)); attr->qname = qname ? *qname : *localname; attr->position = *position; attr->flags = flags; list_add_tail(&reader->attrs, &attr->entry); reader->attr_count++; return S_OK; } /* Returns current element, doesn't check if reader is actually positioned on it. */ static struct element *reader_get_element(xmlreader *reader) { if (reader->is_empty_element) return &reader->empty_element; return LIST_ENTRY(list_head(&reader->elements), struct element, entry); } static inline void reader_init_strvalue(UINT start, UINT len, strval *v) { v->start = start; v->len = len; v->str = NULL; } static inline const char* debug_strval(const xmlreader *reader, const strval *v) { return debugstr_wn(reader_get_strptr(reader, v), v->len); } /* used to initialize from constant string */ static inline void reader_init_cstrvalue(WCHAR *str, UINT len, strval *v) { v->start = 0; v->len = len; v->str = str; } static void reader_free_strvalue(xmlreader *reader, XmlReaderStringValue type) { reader_free_strvalued(reader, &reader->strvalues[type]); } static void reader_free_strvalues(xmlreader *reader) { int type; for (type = 0; type < StringValue_Last; type++) reader_free_strvalue(reader, type); } /* This helper should only be used to test if strings are the same, it doesn't try to sort. 
*/ static inline int strval_eq(const xmlreader *reader, const strval *str1, const strval *str2) { if (str1->len != str2->len) return 0; return !memcmp(reader_get_strptr(reader, str1), reader_get_strptr(reader, str2), str1->len*sizeof(WCHAR)); } static void reader_clear_elements(xmlreader *reader) { struct element *elem, *elem2; LIST_FOR_EACH_ENTRY_SAFE(elem, elem2, &reader->elements, struct element, entry) { reader_free_strvalued(reader, &elem->prefix); reader_free_strvalued(reader, &elem->localname); reader_free_strvalued(reader, &elem->qname); reader_free(reader, elem); } list_init(&reader->elements); reader_free_strvalued(reader, &reader->empty_element.localname); reader_free_strvalued(reader, &reader->empty_element.qname); reader->is_empty_element = FALSE; } static struct ns *reader_lookup_ns(xmlreader *reader, const strval *prefix) { struct list *nslist = prefix ? &reader->ns : &reader->nsdef; struct ns *ns; LIST_FOR_EACH_ENTRY_REV(ns, nslist, struct ns, entry) { if (strval_eq(reader, prefix, &ns->prefix)) return ns; } return NULL; } static HRESULT reader_inc_depth(xmlreader *reader) { return (++reader->depth >= reader->max_depth && reader->max_depth) ? SC_E_MAXELEMENTDEPTH : S_OK; } static void reader_dec_depth(xmlreader *reader) { if (reader->depth) reader->depth--; } static HRESULT reader_push_ns(xmlreader *reader, const strval *prefix, const strval *uri, BOOL def) { struct ns *ns; HRESULT hr; ns = reader_alloc(reader, sizeof(*ns)); if (!ns) return E_OUTOFMEMORY; if (def) memset(&ns->prefix, 0, sizeof(ns->prefix)); else { hr = reader_strvaldup(reader, prefix, &ns->prefix); if (FAILED(hr)) { reader_free(reader, ns); return hr; } } hr = reader_strvaldup(reader, uri, &ns->uri); if (FAILED(hr)) { reader_free_strvalued(reader, &ns->prefix); reader_free(reader, ns); return hr; } ns->element = NULL; list_add_head(def ? 
&reader->nsdef : &reader->ns, &ns->entry); return hr; } static void reader_free_element(xmlreader *reader, struct element *element) { reader_free_strvalued(reader, &element->prefix); reader_free_strvalued(reader, &element->localname); reader_free_strvalued(reader, &element->qname); reader_free(reader, element); } static void reader_mark_ns_nodes(xmlreader *reader, struct element *element) { struct ns *ns; LIST_FOR_EACH_ENTRY(ns, &reader->ns, struct ns, entry) { if (ns->element) break; ns->element = element; } LIST_FOR_EACH_ENTRY(ns, &reader->nsdef, struct ns, entry) { if (ns->element) break; ns->element = element; } } static HRESULT reader_push_element(xmlreader *reader, strval *prefix, strval *localname, strval *qname, const struct reader_position *position) { struct element *element; HRESULT hr; element = reader_alloc_zero(reader, sizeof(*element)); if (!element) return E_OUTOFMEMORY; if ((hr = reader_strvaldup(reader, prefix, &element->prefix)) == S_OK && (hr = reader_strvaldup(reader, localname, &element->localname)) == S_OK && (hr = reader_strvaldup(reader, qname, &element->qname)) == S_OK) { list_add_head(&reader->elements, &element->entry); reader_mark_ns_nodes(reader, element); reader->is_empty_element = FALSE; element->position = *position; } else reader_free_element(reader, element); return hr; } static void reader_pop_ns_nodes(xmlreader *reader, struct element *element) { struct ns *ns, *ns2; LIST_FOR_EACH_ENTRY_SAFE_REV(ns, ns2, &reader->ns, struct ns, entry) { if (ns->element != element) break; list_remove(&ns->entry); reader_free_strvalued(reader, &ns->prefix); reader_free_strvalued(reader, &ns->uri); reader_free(reader, ns); } if (!list_empty(&reader->nsdef)) { ns = LIST_ENTRY(list_head(&reader->nsdef), struct ns, entry); if (ns->element == element) { list_remove(&ns->entry); reader_free_strvalued(reader, &ns->prefix); reader_free_strvalued(reader, &ns->uri); reader_free(reader, ns); } } } static void reader_pop_element(xmlreader *reader) { struct 
element *element; if (list_empty(&reader->elements)) return; element = LIST_ENTRY(list_head(&reader->elements), struct element, entry); list_remove(&element->entry); reader_pop_ns_nodes(reader, element); reader_free_element(reader, element); /* It was a root element, the rest is expected as Misc */ if (list_empty(&reader->elements)) reader->instate = XmlReadInState_MiscEnd; } /* Always make a copy, cause strings are supposed to be null terminated. Null pointer for 'value' means node value is to be determined. */ static void reader_set_strvalue(xmlreader *reader, XmlReaderStringValue type, const strval *value) { strval *v = &reader->strvalues[type]; reader_free_strvalue(reader, type); if (!value) { v->str = NULL; v->start = 0; v->len = 0; return; } if (value->str == strval_empty.str) *v = *value; else { if (type == StringValue_Value) { /* defer allocation for value string */ v->str = NULL; v->start = value->start; v->len = value->len; } else { v->str = reader_alloc(reader, (value->len + 1)*sizeof(WCHAR)); memcpy(v->str, reader_get_strptr(reader, value), value->len*sizeof(WCHAR)); v->str[value->len] = 0; v->len = value->len; } } } static inline int is_reader_pending(xmlreader *reader) { return reader->input->pending; } static HRESULT init_encoded_buffer(xmlreaderinput *input, encoded_buffer *buffer) { const int initial_len = 0x2000; buffer->data = readerinput_alloc(input, initial_len); if (!buffer->data) return E_OUTOFMEMORY; memset(buffer->data, 0, 4); buffer->cur = 0; buffer->allocated = initial_len; buffer->written = 0; buffer->prev_cr = FALSE; return S_OK; } static void free_encoded_buffer(xmlreaderinput *input, encoded_buffer *buffer) { readerinput_free(input, buffer->data); } HRESULT get_code_page(xml_encoding encoding, UINT *cp) { if (encoding == XmlEncoding_Unknown) { FIXME("unsupported encoding %d\n", encoding); return E_NOTIMPL; } *cp = xml_encoding_map[encoding].cp; return S_OK; } xml_encoding parse_encoding_name(const WCHAR *name, int len) { int min, max, 
n, c; if (!name) return XmlEncoding_Unknown; min = 0; max = ARRAY_SIZE(xml_encoding_map) - 1; while (min <= max) { n = (min+max)/2; if (len != -1) c = _wcsnicmp(xml_encoding_map[n].name, name, len); else c = wcsicmp(xml_encoding_map[n].name, name); if (!c) return xml_encoding_map[n].enc; if (c > 0) max = n-1; else min = n+1; } return XmlEncoding_Unknown; } static HRESULT alloc_input_buffer(xmlreaderinput *input) { input_buffer *buffer; HRESULT hr; input->buffer = NULL; buffer = readerinput_alloc(input, sizeof(*buffer)); if (!buffer) return E_OUTOFMEMORY; buffer->input = input; buffer->code_page = ~0; /* code page is unknown at this point */ hr = init_encoded_buffer(input, &buffer->utf16); if (hr != S_OK) { readerinput_free(input, buffer); return hr; } hr = init_encoded_buffer(input, &buffer->encoded); if (hr != S_OK) { free_encoded_buffer(input, &buffer->utf16); readerinput_free(input, buffer); return hr; } input->buffer = buffer; return S_OK; } static void free_input_buffer(input_buffer *buffer) { free_encoded_buffer(buffer->input, &buffer->encoded); free_encoded_buffer(buffer->input, &buffer->utf16); readerinput_free(buffer->input, buffer); } static void readerinput_release_stream(xmlreaderinput *readerinput) { if (readerinput->stream) { ISequentialStream_Release(readerinput->stream); readerinput->stream = NULL; } } /* Queries already stored interface for IStream/ISequentialStream. 
Interface supplied on creation will be overwritten */ static inline HRESULT readerinput_query_for_stream(xmlreaderinput *readerinput) { HRESULT hr; readerinput_release_stream(readerinput); hr = IUnknown_QueryInterface(readerinput->input, &IID_IStream, (void**)&readerinput->stream); if (hr != S_OK) hr = IUnknown_QueryInterface(readerinput->input, &IID_ISequentialStream, (void**)&readerinput->stream); return hr; } /* reads a chunk to raw buffer */ static HRESULT readerinput_growraw(xmlreaderinput *readerinput) { encoded_buffer *buffer = &readerinput->buffer->encoded; /* to make sure aligned length won't exceed allocated length */ ULONG len = buffer->allocated - buffer->written - 4; ULONG read; HRESULT hr; /* always try to get aligned to 4 bytes, so the only case we can get partially read characters is variable width encodings like UTF-8 */ len = (len + 3) & ~3; /* try to use allocated space or grow */ if (buffer->allocated - buffer->written < len) { buffer->allocated *= 2; buffer->data = readerinput_realloc(readerinput, buffer->data, buffer->allocated); len = buffer->allocated - buffer->written; } read = 0; hr = ISequentialStream_Read(readerinput->stream, buffer->data + buffer->written, len, &read); TRACE("written=%d, alloc=%d, requested=%d, read=%d, ret=0x%08x\n", buffer->written, buffer->allocated, len, read, hr); readerinput->pending = hr == E_PENDING; if (FAILED(hr)) return hr; buffer->written += read; return hr; } /* grows UTF-16 buffer so it has at least 'length' WCHAR chars free on return */ static void readerinput_grow(xmlreaderinput *readerinput, int length) { encoded_buffer *buffer = &readerinput->buffer->utf16; length *= sizeof(WCHAR); /* grow if needed, plus 4 bytes to be sure null terminator will fit in */ if (buffer->allocated < buffer->written + length + 4) { int grown_size = max(2*buffer->allocated, buffer->allocated + length); buffer->data = readerinput_realloc(readerinput, buffer->data, grown_size); buffer->allocated = grown_size; } } static inline 
BOOL readerinput_is_utf8(xmlreaderinput *readerinput)
{
    /* Guesses UTF-8 from the first two raw bytes: "<?", "<!", or '<' followed by
       a non-zero ASCII byte or a UTF-8 lead byte. */
    static const char startA[] = {'<','?'};
    static const char commentA[] = {'<','!'};
    encoded_buffer *buffer = &readerinput->buffer->encoded;
    unsigned char *ptr = (unsigned char*)buffer->data;

    /* lead byte tests go through the unsigned pointer: shifting a plain char
       that is signed never yields the expected bit patterns for bytes >= 0x80 */
    return !memcmp(buffer->data, startA, sizeof(startA)) ||
           !memcmp(buffer->data, commentA, sizeof(commentA)) ||
           /* test start byte */
           (ptr[0] == '<' &&
            (
             (ptr[1] && (ptr[1] <= 0x7f)) ||
             (ptr[1] >> 5) == 0x6  || /* 2 bytes */
             (ptr[1] >> 4) == 0xe  || /* 3 bytes */
             (ptr[1] >> 3) == 0x1e)   /* 4 bytes */
           );
}

/* Detects input encoding from the first bytes of raw buffer, by leading markup
   or by BOM (buffer->cur is moved past a BOM). *enc stays XmlEncoding_Unknown
   when nothing matches. */
static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc)
{
    encoded_buffer *buffer = &readerinput->buffer->encoded;
    static const char utf8bom[] = {0xef,0xbb,0xbf};
    static const char utf16lebom[] = {0xff,0xfe};
    WCHAR *ptrW;

    *enc = XmlEncoding_Unknown;

    if (buffer->written <= 3)
    {
        HRESULT hr = readerinput_growraw(readerinput);
        if (FAILED(hr)) return hr;
        if (buffer->written < 3) return MX_E_INPUTEND;
    }

    ptrW = (WCHAR *)buffer->data;
    /* try start symbols if we have enough data to do that, input buffer should contain
       first chunk already */
    if (readerinput_is_utf8(readerinput))
        *enc = XmlEncoding_UTF8;
    else if (*ptrW == '<')
    {
        ptrW++;
        if (*ptrW == '?' || *ptrW == '!'
|| is_namestartchar(*ptrW)) *enc = XmlEncoding_UTF16; } /* try with BOM now */ else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom))) { buffer->cur += sizeof(utf8bom); *enc = XmlEncoding_UTF8; } else if (!memcmp(buffer->data, utf16lebom, sizeof(utf16lebom))) { buffer->cur += sizeof(utf16lebom); *enc = XmlEncoding_UTF16; } return S_OK; } static int readerinput_get_utf8_convlen(xmlreaderinput *readerinput) { encoded_buffer *buffer = &readerinput->buffer->encoded; int len = buffer->written; /* complete single byte char */ if (!(buffer->data[len-1] & 0x80)) return len; /* find start byte of multibyte char */ while (--len && !(buffer->data[len] & 0xc0)) ; return len; } /* Returns byte length of complete char sequence for buffer code page, it's relative to current buffer position which is currently used for BOM handling only. */ static int readerinput_get_convlen(xmlreaderinput *readerinput) { encoded_buffer *buffer = &readerinput->buffer->encoded; int len; if (readerinput->buffer->code_page == CP_UTF8) len = readerinput_get_utf8_convlen(readerinput); else len = buffer->written; TRACE("%d\n", len - buffer->cur); return len - buffer->cur; } /* It's possible that raw buffer has some leftovers from last conversion - some char sequence that doesn't represent a full code point. Length argument should be calculated with readerinput_get_convlen(), if it's -1 it will be calculated here. 
*/ static void readerinput_shrinkraw(xmlreaderinput *readerinput, int len) { encoded_buffer *buffer = &readerinput->buffer->encoded; if (len == -1) len = readerinput_get_convlen(readerinput); memmove(buffer->data, buffer->data + buffer->cur + (buffer->written - len), len); /* everything below cur is lost too */ buffer->written -= len + buffer->cur; /* after this point we don't need cur offset really, it's used only to mark where actual data begins when first chunk is read */ buffer->cur = 0; } static void fixup_buffer_cr(encoded_buffer *buffer, int off) { BOOL prev_cr = buffer->prev_cr; const WCHAR *src; WCHAR *dest; src = dest = (WCHAR*)buffer->data + off; while ((const char*)src < buffer->data + buffer->written) { if (*src == '\r') { *dest++ = '\n'; src++; prev_cr = TRUE; continue; } if(prev_cr && *src == '\n') src++; else *dest++ = *src++; prev_cr = FALSE; } buffer->written = (char*)dest - buffer->data; buffer->prev_cr = prev_cr; *dest = 0; } /* note that raw buffer content is kept */ static void readerinput_switchencoding(xmlreaderinput *readerinput, xml_encoding enc) { encoded_buffer *src = &readerinput->buffer->encoded; encoded_buffer *dest = &readerinput->buffer->utf16; int len, dest_len; UINT cp = ~0u; HRESULT hr; WCHAR *ptr; hr = get_code_page(enc, &cp); if (FAILED(hr)) return; readerinput->buffer->code_page = cp; len = readerinput_get_convlen(readerinput); TRACE("switching to cp %d\n", cp); /* just copy in this case */ if (enc == XmlEncoding_UTF16) { readerinput_grow(readerinput, len); memcpy(dest->data, src->data + src->cur, len); dest->written += len*sizeof(WCHAR); } else { dest_len = MultiByteToWideChar(cp, 0, src->data + src->cur, len, NULL, 0); readerinput_grow(readerinput, dest_len); ptr = (WCHAR*)dest->data; MultiByteToWideChar(cp, 0, src->data + src->cur, len, ptr, dest_len); ptr[dest_len] = 0; dest->written += dest_len*sizeof(WCHAR); } fixup_buffer_cr(dest, 0); } /* shrinks parsed data a buffer begins with */ static void reader_shrink(xmlreader 
*reader) { encoded_buffer *buffer = &reader->input->buffer->utf16; /* avoid to move too often using threshold shrink length */ if (buffer->cur*sizeof(WCHAR) > buffer->written / 2) { buffer->written -= buffer->cur*sizeof(WCHAR); memmove(buffer->data, (WCHAR*)buffer->data + buffer->cur, buffer->written); buffer->cur = 0; *(WCHAR*)&buffer->data[buffer->written] = 0; } } /* This is a normal way for reader to get new data converted from raw buffer to utf16 buffer. It won't attempt to shrink but will grow destination buffer if needed */ static HRESULT reader_more(xmlreader *reader) { xmlreaderinput *readerinput = reader->input; encoded_buffer *src = &readerinput->buffer->encoded; encoded_buffer *dest = &readerinput->buffer->utf16; UINT cp = readerinput->buffer->code_page; int len, dest_len, prev_len; HRESULT hr; WCHAR *ptr; /* get some raw data from stream first */ hr = readerinput_growraw(readerinput); len = readerinput_get_convlen(readerinput); prev_len = dest->written / sizeof(WCHAR); /* just copy for UTF-16 case */ if (cp == 1200) { readerinput_grow(readerinput, len); memcpy(dest->data + dest->written, src->data + src->cur, len); dest->written += len*sizeof(WCHAR); } else { dest_len = MultiByteToWideChar(cp, 0, src->data + src->cur, len, NULL, 0); readerinput_grow(readerinput, dest_len); ptr = (WCHAR*)(dest->data + dest->written); MultiByteToWideChar(cp, 0, src->data + src->cur, len, ptr, dest_len); ptr[dest_len] = 0; dest->written += dest_len*sizeof(WCHAR); /* get rid of processed data */ readerinput_shrinkraw(readerinput, len); } fixup_buffer_cr(dest, prev_len); return hr; } static inline UINT reader_get_cur(xmlreader *reader) { return reader->input->buffer->utf16.cur; } static inline WCHAR *reader_get_ptr(xmlreader *reader) { encoded_buffer *buffer = &reader->input->buffer->utf16; WCHAR *ptr = (WCHAR*)buffer->data + buffer->cur; if (!*ptr) reader_more(reader); return (WCHAR*)buffer->data + buffer->cur; } static int reader_cmp(xmlreader *reader, const WCHAR *str) { 
int i=0; const WCHAR *ptr = reader_get_ptr(reader); while (str[i]) { if (!ptr[i]) { reader_more(reader); ptr = reader_get_ptr(reader); } if (str[i] != ptr[i]) return ptr[i] - str[i]; i++; } return 0; } static void reader_update_position(xmlreader *reader, WCHAR ch) { if (ch == '\r') reader->position.line_position = 1; else if (ch == '\n') { reader->position.line_number++; reader->position.line_position = 1; } else reader->position.line_position++; } /* moves cursor n WCHARs forward */ static void reader_skipn(xmlreader *reader, int n) { encoded_buffer *buffer = &reader->input->buffer->utf16; const WCHAR *ptr; while (*(ptr = reader_get_ptr(reader)) && n--) { reader_update_position(reader, *ptr); buffer->cur++; } } static inline BOOL is_wchar_space(WCHAR ch) { return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'; } /* [3] S ::= (#x20 | #x9 | #xD | #xA)+ */ static int reader_skipspaces(xmlreader *reader) { const WCHAR *ptr = reader_get_ptr(reader); UINT start = reader_get_cur(reader); while (is_wchar_space(*ptr)) { reader_skipn(reader, 1); ptr = reader_get_ptr(reader); } return reader_get_cur(reader) - start; } /* [26] VersionNum ::= '1.' [0-9]+ */ static HRESULT reader_parse_versionnum(xmlreader *reader, strval *val) { static const WCHAR onedotW[] = {'1','.',0}; WCHAR *ptr, *ptr2; UINT start; if (reader_cmp(reader, onedotW)) return WC_E_XMLDECL; start = reader_get_cur(reader); /* skip "1." */ reader_skipn(reader, 2); ptr2 = ptr = reader_get_ptr(reader); while (*ptr >= '0' && *ptr <= '9') { reader_skipn(reader, 1); ptr = reader_get_ptr(reader); } if (ptr2 == ptr) return WC_E_DIGIT; reader_init_strvalue(start, reader_get_cur(reader)-start, val); TRACE("version=%s\n", debug_strval(reader, val)); return S_OK; } /* [25] Eq ::= S? '=' S? 
*/ static HRESULT reader_parse_eq(xmlreader *reader) { static const WCHAR eqW[] = {'=',0}; reader_skipspaces(reader); if (reader_cmp(reader, eqW)) return WC_E_EQUAL; /* skip '=' */ reader_skipn(reader, 1); reader_skipspaces(reader); return S_OK; } /* [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') */ static HRESULT reader_parse_versioninfo(xmlreader *reader) { static const WCHAR versionW[] = {'v','e','r','s','i','o','n',0}; struct reader_position position; strval val, name; HRESULT hr; if (!reader_skipspaces(reader)) return WC_E_WHITESPACE; position = reader->position; if (reader_cmp(reader, versionW)) return WC_E_XMLDECL; reader_init_strvalue(reader_get_cur(reader), 7, &name); /* skip 'version' */ reader_skipn(reader, 7); hr = reader_parse_eq(reader); if (FAILED(hr)) return hr; if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); hr = reader_parse_versionnum(reader, &val); if (FAILED(hr)) return hr; if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); return reader_add_attr(reader, NULL, &name, NULL, &val, &position, 0); } /* ([A-Za-z0-9._] | '-') */ static inline BOOL is_wchar_encname(WCHAR ch) { return ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') || (ch == '.') || (ch == '_') || (ch == '-')); } /* [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* */ static HRESULT reader_parse_encname(xmlreader *reader, strval *val) { WCHAR *start = reader_get_ptr(reader), *ptr; xml_encoding enc; int len; if ((*start < 'A' || *start > 'Z') && (*start < 'a' || *start > 'z')) return WC_E_ENCNAME; val->start = reader_get_cur(reader); ptr = start; while (is_wchar_encname(*++ptr)) ; len = ptr - start; enc = parse_encoding_name(start, len); TRACE("encoding name %s\n", debugstr_wn(start, len)); val->str = start; val->len = len; if (enc == XmlEncoding_Unknown) return 
WC_E_ENCNAME; /* skip encoding name */ reader_skipn(reader, len); return S_OK; } /* [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) */ static HRESULT reader_parse_encdecl(xmlreader *reader) { static const WCHAR encodingW[] = {'e','n','c','o','d','i','n','g',0}; struct reader_position position; strval name, val; HRESULT hr; if (!reader_skipspaces(reader)) return S_FALSE; position = reader->position; if (reader_cmp(reader, encodingW)) return S_FALSE; name.str = reader_get_ptr(reader); name.start = reader_get_cur(reader); name.len = 8; /* skip 'encoding' */ reader_skipn(reader, 8); hr = reader_parse_eq(reader); if (FAILED(hr)) return hr; if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); hr = reader_parse_encname(reader, &val); if (FAILED(hr)) return hr; if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); return reader_add_attr(reader, NULL, &name, NULL, &val, &position, 0); } /* [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) */ static HRESULT reader_parse_sddecl(xmlreader *reader) { static const WCHAR standaloneW[] = {'s','t','a','n','d','a','l','o','n','e',0}; static const WCHAR yesW[] = {'y','e','s',0}; static const WCHAR noW[] = {'n','o',0}; struct reader_position position; strval name, val; UINT start; HRESULT hr; if (!reader_skipspaces(reader)) return S_FALSE; position = reader->position; if (reader_cmp(reader, standaloneW)) return S_FALSE; reader_init_strvalue(reader_get_cur(reader), 10, &name); /* skip 'standalone' */ reader_skipn(reader, 10); hr = reader_parse_eq(reader); if (FAILED(hr)) return hr; if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); if (reader_cmp(reader, yesW) && reader_cmp(reader, noW)) return WC_E_XMLDECL; start = reader_get_cur(reader); /* skip 
'yes'|'no' */ reader_skipn(reader, reader_cmp(reader, yesW) ? 2 : 3); reader_init_strvalue(start, reader_get_cur(reader)-start, &val); TRACE("standalone=%s\n", debug_strval(reader, &val)); if (reader_cmp(reader, quoteW) && reader_cmp(reader, dblquoteW)) return WC_E_QUOTE; /* skip "'"|'"' */ reader_skipn(reader, 1); return reader_add_attr(reader, NULL, &name, NULL, &val, &position, 0); } /* [23] XMLDecl ::= '' */ static HRESULT reader_parse_xmldecl(xmlreader *reader) { static const WCHAR xmldeclW[] = {'<','?','x','m','l',' ',0}; static const WCHAR declcloseW[] = {'?','>',0}; struct reader_position position; HRESULT hr; /* check if we have "position; reader_skipn(reader, 3); hr = reader_parse_versioninfo(reader); if (FAILED(hr)) return hr; hr = reader_parse_encdecl(reader); if (FAILED(hr)) return hr; hr = reader_parse_sddecl(reader); if (FAILED(hr)) return hr; reader_skipspaces(reader); if (reader_cmp(reader, declcloseW)) return WC_E_XMLDECL; /* skip '?>' */ reader_skipn(reader, 2); reader->nodetype = XmlNodeType_XmlDeclaration; reader->empty_element.position = position; reader_set_strvalue(reader, StringValue_LocalName, &strval_xml); reader_set_strvalue(reader, StringValue_QualifiedName, &strval_xml); return S_OK; } /* [15] Comment ::= '' */ static HRESULT reader_parse_comment(xmlreader *reader) { WCHAR *ptr; UINT start; if (reader->resumestate == XmlReadResumeState_Comment) { start = reader->resume[XmlReadResume_Body]; ptr = reader_get_ptr(reader); } else { /* skip '