| /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
| /* |
| * This file is part of the LibreOffice project. |
| * |
| * This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| * |
| * This file incorporates work covered by the following license notice: |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed |
| * with this work for additional information regarding copyright |
| * ownership. The ASF licenses this file to you under the Apache |
| * License, Version 2.0 (the "License"); you may not use this file |
| * except in compliance with the License. You may obtain a copy of |
| * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
| */ |
| |
| #include <sal/config.h> |
| |
| #include <cassert> |
| #include <climits> |
| |
| #include <com/sun/star/container/NoSuchElementException.hpp> |
| #include <com/sun/star/uno/RuntimeException.hpp> |
| #include <osl/file.h> |
| #include <rtl/character.hxx> |
| #include <rtl/string.h> |
| #include <rtl/ustring.hxx> |
| #include <sal/log.hxx> |
| #include <sal/types.h> |
| #include <utility> |
| #include <xmlreader/pad.hxx> |
| #include <xmlreader/span.hxx> |
| #include <xmlreader/xmlreader.hxx> |
| |
| namespace xmlreader { |
| |
| namespace { |
| |
// Tells whether c is one of the four white space characters recognized by
// this reader: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
| |
| } |
| |
| XmlReader::XmlReader(OUString fileUrl) |
| : fileUrl_(std::move(fileUrl)) |
| , fileHandle_(nullptr) |
| { |
| oslFileError e = osl_openFile( |
| fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read); |
| switch (e) |
| { |
| case osl_File_E_None: |
| break; |
| case osl_File_E_NOENT: |
| throw css::container::NoSuchElementException( fileUrl_ ); |
| default: |
| throw css::uno::RuntimeException( |
| "cannot open " + fileUrl_ + ": " + OUString::number(e)); |
| } |
| e = osl_getFileSize(fileHandle_, &fileSize_); |
| if (e == osl_File_E_None) { |
| e = osl_mapFile( |
| fileHandle_, &fileAddress_, fileSize_, 0, |
| osl_File_MapFlag_WillNeed); |
| } |
| if (e != osl_File_E_None) { |
| oslFileError e2 = osl_closeFile(fileHandle_); |
| if (e2 != osl_File_E_None) { |
| SAL_WARN( |
| "xmlreader", |
| "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2); |
| } |
| throw css::uno::RuntimeException( |
| "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" ); |
| } |
| namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace"); |
| namespaces_.emplace_back(Span("xml"), NAMESPACE_XML); |
| pos_ = static_cast< char * >(fileAddress_); |
| end_ = pos_ + fileSize_; |
| state_ = State::Content; |
| firstAttribute_ = true; |
| } |
| |
| XmlReader::~XmlReader() { |
| if (!fileHandle_) |
| return; |
| oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_); |
| if (e != osl_File_E_None) { |
| SAL_WARN( |
| "xmlreader", |
| "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e); |
| } |
| e = osl_closeFile(fileHandle_); |
| if (e != osl_File_E_None) { |
| SAL_WARN( |
| "xmlreader", |
| "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); |
| } |
| } |
| |
| int XmlReader::registerNamespaceIri(Span const & iri) { |
| int id = toNamespaceId(namespaceIris_.size()); |
| namespaceIris_.push_back(iri); |
| if (iri == "http://www.w3.org/2001/XMLSchema-instance") { |
| // Old user layer .xcu files used the xsi namespace prefix without |
| // declaring a corresponding namespace binding, see issue 77174; reading |
| // those files during migration would fail without this hack that can be |
| // removed once migration is no longer relevant (see |
| // configmgr::Components::parseModificationLayer): |
| namespaces_.emplace_back(Span("xsi"), id); |
| } |
| return id; |
| } |
| |
| XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) |
| { |
| switch (state_) { |
| case State::Content: |
| switch (reportText) { |
| case Text::NONE: |
| return handleSkippedText(data, nsId); |
| case Text::Raw: |
| return handleRawText(data); |
| default: // Text::Normalized |
| return handleNormalizedText(data); |
| } |
| case State::StartTag: |
| return handleStartTag(nsId, data); |
| case State::EndTag: |
| return handleEndTag(); |
| case State::EmptyElementTag: |
| handleElementEnd(); |
| return Result::End; |
| default: // State::Done |
| return Result::Done; |
| } |
| } |
| |
| bool XmlReader::nextAttribute(int * nsId, Span * localName) { |
| assert(nsId != nullptr && localName != nullptr); |
| if (firstAttribute_) { |
| currentAttribute_ = attributes_.begin(); |
| firstAttribute_ = false; |
| } else { |
| ++currentAttribute_; |
| } |
| if (currentAttribute_ == attributes_.end()) { |
| return false; |
| } |
| if (currentAttribute_->nameColon == nullptr) { |
| *nsId = NAMESPACE_NONE; |
| *localName = Span( |
| currentAttribute_->nameBegin, |
| currentAttribute_->nameEnd - currentAttribute_->nameBegin); |
| } else { |
| *nsId = getNamespaceId( |
| Span( |
| currentAttribute_->nameBegin, |
| currentAttribute_->nameColon - currentAttribute_->nameBegin)); |
| *localName = Span( |
| currentAttribute_->nameColon + 1, |
| currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); |
| } |
| return true; |
| } |
| |
| Span XmlReader::getAttributeValue(bool fullyNormalize) { |
| return handleAttributeValue( |
| currentAttribute_->valueBegin, currentAttribute_->valueEnd, |
| fullyNormalize); |
| } |
| |
| int XmlReader::getNamespaceId(Span const & prefix) const { |
| auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(), |
| [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; }); |
| |
| if (i != namespaces_.rend()) |
| return i->nsId; |
| |
| return NAMESPACE_UNKNOWN; |
| } |
| |
| |
| void XmlReader::normalizeLineEnds(Span const & text) { |
| char const * p = text.begin; |
| sal_Int32 n = text.length; |
| for (;;) { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); |
| if (i < 0) { |
| break; |
| } |
| pad_.add(p, i); |
| p += i + 1; |
| n -= i + 1; |
| if (n == 0 || *p != '\x0A') { |
| pad_.add("\x0A"); |
| } |
| } |
| pad_.add(p, n); |
| } |
| |
| void XmlReader::skipSpace() { |
| while (isSpace(peek())) { |
| ++pos_; |
| } |
| } |
| |
| bool XmlReader::skipComment() { |
| if (rtl_str_shortenedCompare_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), |
| RTL_CONSTASCII_LENGTH("--")) != |
| 0) |
| { |
| return false; |
| } |
| pos_ += RTL_CONSTASCII_LENGTH("--"); |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "premature end (within comment) of " + fileUrl_ ); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("--"); |
| if (read() != '>') { |
| throw css::uno::RuntimeException( |
| "illegal \"--\" within comment in " + fileUrl_ ); |
| } |
| return true; |
| } |
| |
| void XmlReader::skipProcessingInstruction() { |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "bad '<?' in " + fileUrl_ ); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("?>"); |
| } |
| |
| void XmlReader::skipDocumentTypeDeclaration() { |
| // Neither is it checked that the doctypedecl is at the correct position in |
| // the document, nor that it is well-formed: |
| for (;;) { |
| char c = read(); |
| switch (c) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| "premature end (within DTD) of " + fileUrl_ ); |
| case '"': |
| case '\'': |
| { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength( |
| pos_, end_ - pos_, c); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "premature end (within DTD) of " + fileUrl_ ); |
| } |
| pos_ += i + 1; |
| } |
| break; |
| case '>': |
| return; |
| case '[': |
| for (;;) { |
| c = read(); |
| switch (c) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| "premature end (within DTD) of " + fileUrl_ ); |
| case '"': |
| case '\'': |
| { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength( |
| pos_, end_ - pos_, c); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "premature end (within DTD) of " + fileUrl_ ); |
| } |
| pos_ += i + 1; |
| } |
| break; |
| case '<': |
| switch (read()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| "premature end (within DTD) of " + fileUrl_ ); |
| case '!': |
| skipComment(); |
| break; |
| case '?': |
| skipProcessingInstruction(); |
| break; |
| default: |
| break; |
| } |
| break; |
| case ']': |
| skipSpace(); |
| if (read() != '>') { |
| throw css::uno::RuntimeException( |
| "missing \">\" of DTD in " + fileUrl_ ); |
| } |
| return; |
| default: |
| break; |
| } |
| } |
| default: |
| break; |
| } |
| } |
| } |
| |
| Span XmlReader::scanCdataSection() { |
| if (rtl_str_shortenedCompare_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), |
| RTL_CONSTASCII_LENGTH("[CDATA[")) != |
| 0) |
| { |
| return Span(); |
| } |
| pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); |
| char const * begin = pos_; |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "premature end (within CDATA section) of " + fileUrl_ ); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); |
| return Span(begin, i); |
| } |
| |
| bool XmlReader::scanName(char const ** nameColon) { |
| assert(nameColon != nullptr && *nameColon == nullptr); |
| for (char const * begin = pos_;; ++pos_) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| case ' ': |
| case '/': |
| case '=': |
| case '>': |
| return pos_ != begin; |
| case ':': |
| *nameColon = pos_; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| int XmlReader::scanNamespaceIri(char const * begin, char const * end) { |
| assert(begin != nullptr && begin <= end); |
| Span iri(handleAttributeValue(begin, end, false)); |
| for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { |
| if (namespaceIris_[i] == iri) { |
| return toNamespaceId(i); |
| } |
| } |
| return XmlReader::NAMESPACE_UNKNOWN; |
| } |
| |
| char const * XmlReader::handleReference(char const * position, char const * end) |
| { |
| assert(position != nullptr && *position == '&' && position < end); |
| ++position; |
| if (*position == '#') { |
| ++position; |
| sal_uInt32 val = 0; |
| char const * p; |
| if (*position == 'x') { |
| ++position; |
| p = position; |
| for (;; ++position) { |
| char c = *position; |
| if (c >= '0' && c <= '9') { |
| val = 16 * val + (c - '0'); |
| } else if (c >= 'A' && c <= 'F') { |
| val = 16 * val + (c - 'A') + 10; |
| } else if (c >= 'a' && c <= 'f') { |
| val = 16 * val + (c - 'a') + 10; |
| } else { |
| break; |
| } |
| if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow |
| throw css::uno::RuntimeException( |
| "'&#x...' too large in " + fileUrl_ ); |
| } |
| } |
| } else { |
| p = position; |
| for (;; ++position) { |
| char c = *position; |
| if (c >= '0' && c <= '9') { |
| val = 10 * val + (c - '0'); |
| } else { |
| break; |
| } |
| if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow |
| throw css::uno::RuntimeException( |
| "'&#...' too large in " + fileUrl_ ); |
| } |
| } |
| } |
| if (position == p || *position++ != ';') { |
| throw css::uno::RuntimeException( |
| "'&#...' missing ';' in " + fileUrl_ ); |
| } |
| assert(rtl::isUnicodeCodePoint(val)); |
| if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || |
| (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) |
| { |
| throw css::uno::RuntimeException( |
| "character reference denoting invalid character in " + fileUrl_ ); |
| } |
| char buf[4]; |
| sal_Int32 len; |
| if (val < 0x80) { |
| buf[0] = static_cast< char >(val); |
| len = 1; |
| } else if (val < 0x800) { |
| buf[0] = static_cast< char >((val >> 6) | 0xC0); |
| buf[1] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 2; |
| } else if (val < 0x10000) { |
| buf[0] = static_cast< char >((val >> 12) | 0xE0); |
| buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
| buf[2] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 3; |
| } else { |
| buf[0] = static_cast< char >((val >> 18) | 0xF0); |
| buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); |
| buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
| buf[3] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 4; |
| } |
| pad_.addEphemeral(buf, len); |
| return position; |
| } else { |
| struct EntityRef { |
| char const * inBegin; |
| sal_Int32 const inLength; |
| char const * outBegin; |
| sal_Int32 const outLength; |
| }; |
| static EntityRef const refs[] = { |
| { RTL_CONSTASCII_STRINGPARAM("amp;"), |
| RTL_CONSTASCII_STRINGPARAM("&") }, |
| { RTL_CONSTASCII_STRINGPARAM("lt;"), |
| RTL_CONSTASCII_STRINGPARAM("<") }, |
| { RTL_CONSTASCII_STRINGPARAM("gt;"), |
| RTL_CONSTASCII_STRINGPARAM(">") }, |
| { RTL_CONSTASCII_STRINGPARAM("apos;"), |
| RTL_CONSTASCII_STRINGPARAM("'") }, |
| { RTL_CONSTASCII_STRINGPARAM("quot;"), |
| RTL_CONSTASCII_STRINGPARAM("\"") } }; |
| for (const auto & ref : refs) { |
| if (rtl_str_shortenedCompare_WithLength( |
| position, end - position, ref.inBegin, ref.inLength, |
| ref.inLength) == |
| 0) |
| { |
| position += ref.inLength; |
| pad_.add(ref.outBegin, ref.outLength); |
| return position; |
| } |
| } |
| throw css::uno::RuntimeException( |
| "unknown entity reference in " + fileUrl_ ); |
| } |
| } |
| |
| Span XmlReader::handleAttributeValue( |
| char const * begin, char const * end, bool fullyNormalize) |
| { |
| pad_.clear(); |
| if (fullyNormalize) { |
| while (begin != end && isSpace(*begin)) { |
| ++begin; |
| } |
| while (end != begin && isSpace(end[-1])) { |
| --end; |
| } |
| char const * p = begin; |
| enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
| // a single true space character can go into the current span, |
| // everything else breaks the span |
| Space space = SPACE_NONE; |
| while (p != end) { |
| switch (*p) { |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| switch (space) { |
| case SPACE_NONE: |
| pad_.add(begin, p - begin); |
| pad_.add(" "); |
| space = SPACE_BREAK; |
| break; |
| case SPACE_SPAN: |
| pad_.add(begin, p - begin); |
| space = SPACE_BREAK; |
| break; |
| case SPACE_BREAK: |
| break; |
| } |
| begin = ++p; |
| break; |
| case ' ': |
| switch (space) { |
| case SPACE_NONE: |
| ++p; |
| space = SPACE_SPAN; |
| break; |
| case SPACE_SPAN: |
| pad_.add(begin, p - begin); |
| begin = ++p; |
| space = SPACE_BREAK; |
| break; |
| case SPACE_BREAK: |
| begin = ++p; |
| break; |
| } |
| break; |
| case '&': |
| pad_.add(begin, p - begin); |
| p = handleReference(p, end); |
| begin = p; |
| space = SPACE_NONE; |
| break; |
| default: |
| ++p; |
| space = SPACE_NONE; |
| break; |
| } |
| } |
| pad_.add(begin, p - begin); |
| } else { |
| char const * p = begin; |
| while (p != end) { |
| switch (*p) { |
| case '\x09': |
| case '\x0A': |
| pad_.add(begin, p - begin); |
| begin = ++p; |
| pad_.add(" "); |
| break; |
| case '\x0D': |
| pad_.add(begin, p - begin); |
| ++p; |
| if (peek() == '\x0A') { |
| ++p; |
| } |
| begin = p; |
| pad_.add(" "); |
| break; |
| case '&': |
| pad_.add(begin, p - begin); |
| p = handleReference(p, end); |
| begin = p; |
| break; |
| default: |
| ++p; |
| break; |
| } |
| } |
| pad_.add(begin, p - begin); |
| } |
| return pad_.get(); |
| } |
| |
| XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { |
| assert(nsId != nullptr && localName); |
| char const * nameBegin = pos_; |
| char const * nameColon = nullptr; |
| if (!scanName(&nameColon)) { |
| throw css::uno::RuntimeException( |
| "bad tag name in " + fileUrl_ ); |
| } |
| char const * nameEnd = pos_; |
| NamespaceList::size_type inheritedNamespaces = namespaces_.size(); |
| bool hasDefaultNs = false; |
| int defaultNsId = NAMESPACE_NONE; |
| attributes_.clear(); |
| for (;;) { |
| char const * p = pos_; |
| skipSpace(); |
| if (peek() == '/' || peek() == '>') { |
| break; |
| } |
| if (pos_ == p) { |
| throw css::uno::RuntimeException( |
| "missing whitespace before attribute in " + fileUrl_ ); |
| } |
| char const * attrNameBegin = pos_; |
| char const * attrNameColon = nullptr; |
| if (!scanName(&attrNameColon)) { |
| throw css::uno::RuntimeException( |
| "bad attribute name in " + fileUrl_ ); |
| } |
| char const * attrNameEnd = pos_; |
| skipSpace(); |
| if (read() != '=') { |
| throw css::uno::RuntimeException( |
| "missing '=' in " + fileUrl_ ); |
| } |
| skipSpace(); |
| char del = read(); |
| if (del != '\'' && del != '"') { |
| throw css::uno::RuntimeException( |
| "bad attribute value in " + fileUrl_ ); |
| } |
| char const * valueBegin = pos_; |
| sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| "unterminated attribute value in " + fileUrl_ ); |
| } |
| char const * valueEnd = pos_ + i; |
| pos_ += i + 1; |
| if (attrNameColon == nullptr && |
| Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns") |
| { |
| hasDefaultNs = true; |
| defaultNsId = scanNamespaceIri(valueBegin, valueEnd); |
| } else if (attrNameColon != nullptr && |
| Span(attrNameBegin, attrNameColon - attrNameBegin) == |
| "xmlns") |
| { |
| namespaces_.emplace_back( |
| Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), |
| scanNamespaceIri(valueBegin, valueEnd)); |
| } else { |
| attributes_.emplace_back( |
| attrNameBegin, attrNameEnd, attrNameColon, valueBegin, |
| valueEnd); |
| } |
| } |
| if (!hasDefaultNs && !elements_.empty()) { |
| defaultNsId = elements_.top().defaultNamespaceId; |
| } |
| firstAttribute_ = true; |
| if (peek() == '/') { |
| state_ = State::EmptyElementTag; |
| ++pos_; |
| } else { |
| state_ = State::Content; |
| } |
| if (peek() != '>') { |
| throw css::uno::RuntimeException( |
| "missing '>' in " + fileUrl_ ); |
| } |
| ++pos_; |
| elements_.push( |
| ElementData( |
| Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, |
| defaultNsId)); |
| if (nameColon == nullptr) { |
| *nsId = defaultNsId; |
| *localName = Span(nameBegin, nameEnd - nameBegin); |
| } else { |
| *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); |
| *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); |
| } |
| return Result::Begin; |
| } |
| |
| XmlReader::Result XmlReader::handleEndTag() { |
| if (elements_.empty()) { |
| throw css::uno::RuntimeException( |
| "spurious end tag in " + fileUrl_ ); |
| } |
| char const * nameBegin = pos_; |
| char const * nameColon = nullptr; |
| if (!scanName(&nameColon) || |
| !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) |
| { |
| throw css::uno::RuntimeException( |
| "tag mismatch in " + fileUrl_ ); |
| } |
| handleElementEnd(); |
| skipSpace(); |
| if (peek() != '>') { |
| throw css::uno::RuntimeException( |
| "missing '>' in " + fileUrl_ ); |
| } |
| ++pos_; |
| return Result::End; |
| } |
| |
| void XmlReader::handleElementEnd() { |
| assert(!elements_.empty()); |
| auto end = elements_.top().inheritedNamespaces; |
| namespaces_.resize(end); |
| elements_.pop(); |
| state_ = elements_.empty() ? State::Done : State::Content; |
| } |
| |
| XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { |
| for (;;) { |
| auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_)); |
| if (!i) { |
| throw css::uno::RuntimeException( |
| "premature end of " + fileUrl_ ); |
| } |
| pos_ = i + 1; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (!skipComment() && !scanCdataSection().is()) { |
| skipDocumentTypeDeclaration(); |
| } |
| break; |
| case '/': |
| ++pos_; |
| return handleEndTag(); |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| break; |
| default: |
| return handleStartTag(nsId, data); |
| } |
| } |
| } |
| |
| XmlReader::Result XmlReader::handleRawText(Span * text) { |
| pad_.clear(); |
| for (char const * begin = pos_;;) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| "premature end of " + fileUrl_ ); |
| case '\x0D': |
| pad_.add(begin, pos_ - begin); |
| ++pos_; |
| if (peek() != '\x0A') { |
| pad_.add("\x0A"); |
| } |
| begin = pos_; |
| break; |
| case '&': |
| pad_.add(begin, pos_ - begin); |
| pos_ = handleReference(pos_, end_); |
| begin = pos_; |
| break; |
| case '<': |
| pad_.add(begin, pos_ - begin); |
| ++pos_; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (!skipComment()) { |
| Span cdata(scanCdataSection()); |
| if (cdata.is()) { |
| normalizeLineEnds(cdata); |
| } else { |
| skipDocumentTypeDeclaration(); |
| } |
| } |
| begin = pos_; |
| break; |
| case '/': |
| *text = pad_.get(); |
| ++pos_; |
| state_ = State::EndTag; |
| return Result::Text; |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| begin = pos_; |
| break; |
| default: |
| *text = pad_.get(); |
| state_ = State::StartTag; |
| return Result::Text; |
| } |
| break; |
| default: |
| ++pos_; |
| break; |
| } |
| } |
| } |
| |
| XmlReader::Result XmlReader::handleNormalizedText(Span * text) { |
| pad_.clear(); |
| char const * flowBegin = pos_; |
| char const * flowEnd = pos_; |
| enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
| // a single true space character can go into the current flow, |
| // everything else breaks the flow |
| Space space = SPACE_START; |
| for (;;) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| "premature end of " + fileUrl_ ); |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| switch (space) { |
| case SPACE_START: |
| case SPACE_BREAK: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| space = SPACE_BREAK; |
| break; |
| } |
| ++pos_; |
| break; |
| case ' ': |
| switch (space) { |
| case SPACE_START: |
| case SPACE_BREAK: |
| break; |
| case SPACE_NONE: |
| space = SPACE_SPAN; |
| break; |
| case SPACE_SPAN: |
| space = SPACE_BREAK; |
| break; |
| } |
| ++pos_; |
| break; |
| case '&': |
| switch (space) { |
| case SPACE_START: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| pad_.add(flowBegin, pos_ - flowBegin); |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(" "); |
| break; |
| } |
| pos_ = handleReference(pos_, end_); |
| flowBegin = pos_; |
| flowEnd = pos_; |
| space = SPACE_NONE; |
| break; |
| case '<': |
| ++pos_; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (skipComment()) { |
| space = SPACE_BREAK; |
| } else { |
| Span cdata(scanCdataSection()); |
| if (cdata.is()) { |
| // CDATA is not normalized (similar to character |
| // references; it keeps the code simple), but it might |
| // arguably be better to normalize it: |
| switch (space) { |
| case SPACE_START: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| pad_.add(flowBegin, pos_ - flowBegin); |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(" "); |
| break; |
| } |
| normalizeLineEnds(cdata); |
| flowBegin = pos_; |
| flowEnd = pos_; |
| space = SPACE_NONE; |
| } else { |
| skipDocumentTypeDeclaration(); |
| } |
| } |
| break; |
| case '/': |
| ++pos_; |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| *text = pad_.get(); |
| state_ = State::EndTag; |
| return Result::Text; |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| space = SPACE_BREAK; |
| break; |
| default: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| *text = pad_.get(); |
| state_ = State::StartTag; |
| return Result::Text; |
| } |
| break; |
| default: |
| switch (space) { |
| case SPACE_START: |
| flowBegin = pos_; |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(" "); |
| flowBegin = pos_; |
| break; |
| } |
| flowEnd = ++pos_; |
| space = SPACE_NONE; |
| break; |
| } |
| } |
| } |
| |
| int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { |
| assert(pos <= INT_MAX); |
| return static_cast< int >(pos); |
| } |
| |
| } |
| |
| /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |