/* * Copyright (c) 2021-2025 Symas Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of the Symas Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "libgcobol-fp.h" #include "ec.h" #include "common-defs.h" #include "io.h" #include "gcobolio.h" #include "libgcobol.h" #define COUNT_OF(X) (sizeof(X) / sizeof(X[0])) void sayso( const char func[], int line, int len = 0 , const unsigned char data[] = { 0} ) { if( getenv("XMLPARSE") ) { switch(len) { case 0: fprintf(stderr, "%s:%d Kilroy was here\n", func, line); break; case -1: fprintf(stderr, "%s:%d: '%s'\n", func, line, data); break; default: fprintf(stderr, "%s:%d: '%.*s'\n", func, line, len, data); break; } } } #define SAYSO() sayso(__func__, __LINE__) #define SAYSO_DATAZ(S) sayso(__func__, __LINE__, -1, S) #define SAYSO_DATA(N, S) sayso(__func__, __LINE__, N, S) #define CTX ctx __attribute__ ((unused)) struct xml_ec_value_t { int ibm_code; const char msg[80]; } xml_ec_values[] = { // Table 73. XML PARSE exceptions that allow continuation { 1, "invalid character between elements" }, { 2, "invalid start before element content" }, { 3, "duplicate attribute" }, { 4, "markup character '<' in an attribute value" }, { 5, "start/end tag mismatch" }, { 6, "invalid character in element" }, { 7, "invalid start in element content. 
" }, { 8, "CDATA closing character sequence ']]>' not opened" }, { 10, "comment the character sequence '--' without '>'" }, { 11, "invalid character in a processing instruction" }, { 12, "XML declaration was not start of document" }, { 13, "invalid digit in a hexadecimal character reference" }, { 14, "invalid digit in a decimal character reference" }, { 15, "encoding declaration value name must start with [a-zA-Z] character" }, { 16, "character reference did not refer to a legal XML character" }, { 17, "invalid character in an entity reference name" }, { 70, "EBCDIC document, supported EBCDIC page, unsupported declaration" }, { 71, "EBCDIC document, unsupported EBCDIC page " }, { 72, "EBCDIC document, unsupported EBCDIC page, unsupported declaration" }, { 73, "EBCDIC document, unsupported EBCDIC page and declaration " }, { 80, "ASCII document, supported ASCII page, unsupported declaration" }, { 81, "ASCII document, unsupported ASCII page " }, { 82, "ASCII document, unsupported ASCII page, unsupported declaration" }, { 83, "ASCII document, unsupported ASCII page and declaration " }, { 84, "ASCII document, invalid UTF-8, external UTF-8, no declaration. " }, { 85, "ASCII document, invalid UTF-8, external UTF-8, invalid declaration" }, { 86, "ASCII document, invalid UTF-8, external ASCII" }, { 87, "ASCII document, invalid UTF-8, external and document UTF-8" }, { 88, "ASCII document, invalid UTF-8, unsupported ASCII/UTF-8, UTF-8 declaration" }, { 89, "ASCII document, invalid UTF-8, external UTF-8, ASCII declaration" }, { 92, "alphanumeric document expected, document is UTF-16. 
" }, // XML PARSE exceptions that allow continuation (continued) //// 100,001 - 165,535 EBCDIC document encoding does not match code page //// 200,001 - 265,535 ASCII document encoding does not match code page // XML PARSE exceptions that do not allow continuation { 100, "end of document before start of XML declaration" }, { 101, "end of document before end of XML declaration" }, { 102, "end of document before root element" }, { 103, "end of document before version information in XML declaration" }, { 104, "end of document before version information value in XML declaration" }, { 106, "end of document before encoding declaration value in XML declaration" }, { 108, "end of document before standalone declaration value in XML declaration" }, { 109, "end of document before attribute name" }, { 110, "end of document before attribute value" }, { 111, "end of document before character/entity reference in attribute value" }, { 112, "end of document before empty element tag" }, { 113, "end of document before root element name" }, { 114, "end of document before element name" }, { 115, "end of document before character data in element content" }, { 116, "end of document before processing instruction in element content" }, { 117, "end of document before comment or CDATA section in element content" }, { 118, "end of document before comment in element content" }, { 119, "end of document before CDATA section in element content" }, { 120, "end of document before character/entity reference in element content" }, { 121, "end of document before after close of root element" }, { 122, "possible invalid start of a document type" }, { 123, "duplicate document type" }, { 124, "root element name must start with [A-Za-z_:]" }, { 125, "first attribute name must start with [A-Za-z_:]" }, { 126, "invalid character in or after element name" }, { 127, "attribute name not followed by '=' " }, { 128, "invalid attribute value delimiter" }, { 130, "attribute name must start with [A-Za-z_:]" }, { 
131, "invalid character in or after attribute name" }, { 132, "empty element tag not terminated with '/>'" }, { 133, "element end tag name name must start with [A-Za-z_:]" }, { 134, "element end tag not terminated with '>'" }, { 135, "element name must start with [A-Za-z_:]" }, { 136, "invalid start of comment/CDATA in element" }, { 137, "invalid start of comment" }, { 138, "processing instruction target name must start with [A-Za-z_:]" }, { 139, "invalid character in/afterprocessing instruction target name" }, { 140, "processing instruction not terminated with '?>'" }, { 141, "invalid character following '&' in a character/entity reference" }, { 142, "missing version information in XML declaration" }, { 143, "missing '=' after 'version' in XML declaration " }, { 144, "missing XML version declaration " }, { 145, "invalid character in XML version information" }, { 146, "invalid character following XML version information value " }, { 147, "invalid attribute in XML declaration" }, { 148, "missing '=' after 'encoding' in XML declaration" }, { 149, "missing XML encoding declaration value" }, { 150, "invalid XML encoding declaration value" }, { 151, "invalid character afer XML declaration" }, { 152, "invalid attribute XML declaration" }, { 153, "missing '=' after standalone XML declaration" }, { 154, "missing standalone XML declaration value" }, { 155, "standalone declaration must be 'yes' or 'no'" }, { 156, "invalid standalone XML declaration value" }, { 157, "invalid character following XML standalone declaration value" }, { 158, "unterminated XML declaration " }, { 159, "start of document type declaration after end of root element" }, { 160, "start of element after end of root element" }, { 161, "invalid UTF-8 byte sequence" }, { 162, "UTF-8 character that has a Unicode code point above x'FFFF'" }, { 315, "UTF-16 document little-endian unsupported" }, { 316, "UCS4 document unsupported" }, { 317, "unrecognized document encoding" }, { 318, "UTF-8 document unsupported " 
}, { 320, "mismatched national document data item to document encoding EBCDIC" }, { 321, "mismatched national document data item to document encoding ASCII" }, { 322, "mismatched native alphanumeric document data item to document encoding EBCDIC" }, { 323, "mismatched host alphanumeric document data item to document encoding ASCII" }, { 324, "mismatched national document data item to document encoding UTF-8" }, { 325, "mismatched host alphanumeric document datat to document encoding UTF-8" }, { 500, "internal error" }, }, *eoxml_ec_values = xml_ec_values + COUNT_OF(xml_ec_values); static const xml_ec_value_t * xml_ec_value_of( int ibm_code ) { if( 100000 < ibm_code && ibm_code < 200000 ) { static xml_ec_value_t not_ebcdic{ 0, "EBCDIC document encoding " "does not match code page" }; not_ebcdic.ibm_code = ibm_code; return ¬_ebcdic; } if( 200000 < ibm_code && ibm_code < 300000 ) { static xml_ec_value_t not_ascii{ 0, "ASCII document encoding " "does not match code page" }; not_ascii.ibm_code = ibm_code; return ¬_ascii; } auto p = std::find_if( xml_ec_values, eoxml_ec_values, [ibm_code]( const auto& value ) { return ibm_code == value.ibm_code; } ); return p < eoxml_ec_values ? &*p : nullptr; } const char * xml_ec_value_str( int ibm_code ) { auto p = xml_ec_value_of(ibm_code); return p? p->msg : nullptr; } #if NEEDED static bool xml_fatal( int ibm_code ) { if( ibm_code < 100 ) return false; if( ibm_code > 100000 ) return false; assert(ibm_code < 1000); return true; } #endif static callback_t *cobol_callback; /* * Internal handler functions */ /////////////// /* ATTRIBUTE-CHARACTER The single character that corresponds with the predefined entity reference in the attribute value ATTRIBUTE-CHARACTERS The value within quotation marks or apostrophes. This can be a substring of the attribute value if the value includes an entity reference. 
  ATTRIBUTE-NAME
    The attribute name; the string to the left of the equal sign

  ATTRIBUTE-NATIONAL-CHARACTER
    Regardless of the type of the XML document specified by identifier-1
    in the XML PARSE statement, XML-TEXT is empty with length zero and
    XML-NTEXT contains the single national character that corresponds
    with the numeric character reference.

  CONTENT-CHARACTER
    The single character that corresponds with the predefined entity
    reference in the element content

  CONTENT-NATIONAL-CHARACTER
    Regardless of the type of the XML document specified by identifier-1
    in the XML PARSE statement, XML-TEXT is empty with length zero and
    XML-NTEXT contains the single national character that corresponds
    with the numeric character reference. [1]

  DOCUMENT-TYPE-DECLARATION
    The entire document type declaration, including the opening and
    closing character sequences "<!DOCTYPE" and ">"

  ENCODING-DECLARATION
    The value, between quotes or apostrophes, of the encoding
    declaration in the XML declaration

  END-OF-CDATA-SECTION
    The string "]]>"

  END-OF-DOCUMENT
    Empty with length zero

  EXCEPTION
    The part of the document that was successfully scanned, up to and
    including the point at which the exception was detected. [2]
    Special register XML-CODE contains the unique error code that
    identifies the exception.

PROCESSING-INSTRUCTION-TARGET The processing instruction target name, which occurs immediately after the processing instruction opening sequence, "(text); __ggsr__xml_text.capacity = __ggsr__xml_text.allocated = len; __ggsr__xml_code.data = 0; cobol_callback(); } static inline void xml_event( const char event_name[], char text[] ) { xml_event(event_name, strlen(text), text); } static inline void xml_event( const char event_name[], size_t len, const xmlChar * value ) { char *text = reinterpret_cast(const_cast(value)); xml_event(event_name, len, text); } static inline void xml_event( const char event_name[], const xmlChar * value ) { char *text = reinterpret_cast(const_cast(value)); xml_event(event_name, strlen(text), text); } /* * Many static handler functions are defined but not used while we learn what * is needed. */ #pragma GCC diagnostic ignored "-Wunused-function" static void attributeDecl(void * CTX, const xmlChar * elem, const xmlChar * fullname, int type __attribute__ ((unused)), int def __attribute__ ((unused)), const xmlChar * defaultValue, xmlEnumerationPtr tree __attribute__ ((unused)) ) { fprintf(stderr, "%s:%d: elem=%s, name=%s, default=%s\n", __func__, __LINE__, elem, fullname, defaultValue); } static void cdataBlock(void * CTX, const xmlChar * data, int len) { SAYSO_DATA(len, data); xml_event("CONTENT-CHARACTERS", len, data); } static void characters(void * CTX, const xmlChar * data, int len) { SAYSO_DATA(len, data); xml_event("CONTENT-CHARACTERS", len, data); } static void comment(void * CTX, const xmlChar * value) { SAYSO_DATAZ(value); xml_event("COMMENT", value); } static void elementDecl(void * CTX, const xmlChar * name, int type __attribute__ ((unused)), xmlElementContentPtr content __attribute__ ((unused)) ) { SAYSO_DATAZ(name); } static void endDocument(void * CTX) { SAYSO(); } static void endElementNs(void * CTX, const xmlChar * localname, const xmlChar * prefix, const xmlChar * URI __attribute__ ((unused)) ) { SAYSO_DATAZ(prefix); 
SAYSO_DATAZ(localname); xml_event("END-OF-ELEMENT", localname); } static void endElement(void * CTX, const xmlChar * name) { SAYSO_DATAZ(name); } static void entityDecl(void * CTX, const xmlChar * name, int type __attribute__ ((unused)), const xmlChar * publicId __attribute__ ((unused)), const xmlChar * systemId __attribute__ ((unused)), xmlChar * content ) { SAYSO_DATAZ(name); SAYSO_DATAZ(content); } static void error(void * CTX, const char * msg, ...) { va_list ap; va_start (ap, msg); fprintf(stderr, "error: "); vfprintf(stderr, msg, ap); fprintf(stderr, "\n"); va_end (ap); } static void externalSubset(void * CTX, const xmlChar * name, const xmlChar * ExternalID, const xmlChar * SystemID) { SAYSO_DATAZ(name); SAYSO_DATAZ(ExternalID); SAYSO_DATAZ(SystemID); } static void fatalError(void * CTX, const char * msg, ...) { va_list ap; va_start (ap, msg); fprintf(stderr, "fatal: "); vfprintf(stderr, msg, ap); fprintf(stderr, "\n"); va_end (ap); } #if 0 static xmlEntityPtr getEntity(void * CTX, const xmlChar * name) { SAYSO_DATAZ(name); } static xmlEntityPtr getParameterEntity(void * CTX, const xmlChar * name) { SAYSO_DATAZ(name); } #endif static int hasExternalSubset(void * CTX) { SAYSO(); return 0; } static int hasInternalSubset(void * CTX) { SAYSO(); return 0; } static void ignorableWhitespace(void * CTX, const xmlChar * ch, int len) { SAYSO_DATA(len, ch); } static void internalSubset(void * CTX, const xmlChar * name, const xmlChar * ExternalID, const xmlChar * SystemID) { SAYSO_DATAZ(name); SAYSO_DATAZ(ExternalID); SAYSO_DATAZ(SystemID); } #if 0 static int isStandalone (void * CTX) { SAYSO(); } #endif static void notationDecl(void * CTX, const xmlChar * name, const xmlChar * publicId, const xmlChar * systemId) { SAYSO_DATAZ(name); SAYSO_DATAZ(publicId); SAYSO_DATAZ(systemId); } static void processingInstruction(void * CTX, const xmlChar * target, const xmlChar * data) { SAYSO_DATAZ(target); xml_event("PROCESSING-INSTRUCTION-TARGET", target); SAYSO_DATAZ(data); 
xml_event("PROCESSING-INSTRUCTION-DATA", data); } static void reference(void * CTX, const xmlChar * name) { SAYSO_DATAZ(name); } #if 0 static xmlParserInputPtr resolveEntity( void * CTX, const xmlChar * publicId, const xmlChar * systemId) { SAYSO(); } #endif static void setDocumentLocator(void * CTX, xmlSAXLocatorPtr loc __attribute__ ((unused)) ) { SAYSO(); } /* * Called after the XML declaration was parsed. * Use xmlCtxtGetVersion(), xmlCtxtGetDeclaredEncoding() and * xmlCtxtGetStandalone() to get data from the XML declaration. */ static void startDocument(void * CTX) { SAYSO(); } static void startElementNs(void * CTX, const xmlChar * localname, const xmlChar * prefix, const xmlChar * URI, int nb_namespaces __attribute__ ((unused)), const xmlChar ** namespaces __attribute__ ((unused)), int nb_attributes __attribute__ ((unused)), int nb_defaulted __attribute__ ((unused)), const xmlChar ** attributes __attribute__ ((unused))) { SAYSO_DATAZ(prefix); SAYSO_DATAZ(URI); SAYSO_DATAZ(localname); xml_event("START-OF-ELEMENT", localname); } static void startElement(void * CTX, const xmlChar * name, const xmlChar ** atts) { SAYSO_DATAZ(name); for( int i=0; atts[i]; i++ ) SAYSO_DATAZ(atts[i]); } static void unparsedEntityDecl(void * CTX, const xmlChar * name, const xmlChar * publicId, const xmlChar * systemId, const xmlChar * notationName) { SAYSO_DATAZ(name); SAYSO_DATAZ(publicId); SAYSO_DATAZ(systemId); SAYSO_DATAZ(notationName); } static void warning(void * CTX, const char * msg, ... ) { va_list ap; va_start (ap, msg); fprintf(stderr, "warning: "); vfprintf(stderr, msg, ap); fprintf(stderr, "\n"); va_end (ap); } /* * xmlSAXHandler is a structure of function pointers that the SAX parser calls * as it encounters XML elements in the input. Each pointer is a callback * function, locally defined in this file. These we term "handlers". 
* * Each handler sets the XML registers per IBM, and then calls * cobol_callback(), which is a function pointer supplied by the COBOL program * to be the processing procedure for XML PARSE. * * There is no obvious way to abort parsing at the C level. See: * http://veillard.com/XML/messages/0540.html * * > The simplest to implement this would not be to add a new SAX * > callback but rather modify the xmlParserCtxtPtr passed to the * > callbacks. The best seems to be: * > - set ctxt->instate to XML_PARSER_EOF * > - hack xmlCurrentChar() to return 0 * > if (ctxt->instate == XML_PARSER_EOF) * > Doing both should led to a quick termination of parsing * > (but endElement(s)/endDocument will certainly be called anyway). * * Another hack might be to set the input to all blanks in cobol_callback. */ static xmlSAXHandler handlers; static void initialize_handlers( callback_t *callback ) { handlers = xmlSAXHandler {}; handlers.initialized = XML_SAX2_MAGIC; cobol_callback = callback; #if 0 //// Should typically not be modified handlers.attributeDecl = attributeDecl; handlers.elementDecl = elementDecl; handlers.entityDecl = entityDecl; handlers.externalSubset = externalSubset; handlers.getEntity = getEntity; handlers.getParameterEntity = getParameterEntity; handlers.internalSubset = internalSubset; handlers.notationDecl = notationDecl; handlers.resolveEntity = resolveEntity; handlers.unparsedEntityDecl = unparsedEntityDecl; //// Not supposed to be changed by applications handlers.hasExternalSubset = hasExternalSubset; handlers.hasInternalSubset = hasInternalSubset; handlers.isStandalone = isStandalone; //// SAX 1 only handlers.startElement = startElement; handlers.endElement = endElement; //// Everything is available on the context, so this is useless in our case handlers.setDocumentLocator = setDocumentLocator; #endif handlers.cdataBlock = cdataBlock; handlers.characters = characters; handlers.comment = comment; handlers.endDocument = endDocument; handlers.endElementNs = 
endElementNs; handlers.ignorableWhitespace = ignorableWhitespace; handlers.processingInstruction = processingInstruction; handlers.reference = reference; handlers.startDocument = startDocument; handlers.startElementNs = startElementNs; handlers.error = error; handlers.fatalError = fatalError; handlers.warning = warning; } static xmlChar * xmlchar_of( const char input[] ) { return const_cast( reinterpret_cast(input) ); } static const char * xmlParserErrors_str( xmlParserErrors erc, const char name[] ) { const char *msg = "???"; switch( erc ) { case XML_ERR_OK: msg = "Success"; break; case XML_ERR_INTERNAL_ERROR: msg = "Internal assertion failure"; break; case XML_ERR_NO_MEMORY: msg = "Out of memory"; break; case XML_ERR_UNSUPPORTED_ENCODING: msg = "Unsupported character encoding"; break; #if LIBXML_VERSION >= 21400 case XML_ERR_RESOURCE_LIMIT: msg = "Internal resource limit like maximum amplification factor exceeded"; break; case XML_ERR_ARGUMENT: msg = "Invalid argument"; break; case XML_ERR_SYSTEM: msg = "Unexpected error from the OS or an external library"; break; #endif case XML_IO_ENOENT: msg = "File not found"; break; default: msg = strdup(name); if( ! msg ) msg = "unknown XML error"; break; } return msg; } #define xmlerror_str(E) xmlParserErrors_str( (E), #E ) /* * The global context is NULL if XML PARSE is not in progress. */ static class context_t { const int priority; public: xmlParserCtxt * ctxt; context_t() : priority(LOG_INFO), ctxt(nullptr) { const int option = LOG_PERROR, facility = LOG_USER; #if HAVE_DECL_PROGRAM_INVOCATION_SHORT_NAME /* Declared in errno.h, when available. */ static const char * const ident = program_invocation_short_name; #elif defined (HAVE_GETPROGNAME) /* Declared in stdlib.h. */ static const char * const ident = getprogname(); #else /* Avoid a NULL entry. */ static const char * const ident = "unnamed_COBOL_program"; #endif // TODO: Program to set option in library via command-line and/or // environment. 
// Library listens to program, not to the environment. openlog(ident, option, facility); initialize_handlers(nullptr); } void push( const cblc_field_t *input_field, size_t input_offset, size_t len, bool done ) { if( ! ctxt ) { init(); } assert(cobol_callback); // caller must set if( input_offset < len ) { int size = len - input_offset; const char *chunk = PTRCAST(char, input_field->data + input_offset); int terminate = done? 1 : 0; auto erc = (xmlParserErrors )xmlParseChunk( ctxt, chunk, size, terminate ); if( erc != 0 ) { auto msg = xmlerror_str(erc); syslog(priority, "XML PARSE: XML error: %s", msg); } if( done ) this->done(); } } void done() { if( ctxt ) { xmlFreeParserCtxt( ctxt ); ctxt = nullptr; } } protected: void init() { const char *external_entities = nullptr; void * const user_data = nullptr; ctxt = xmlCreatePushParserCtxt( &handlers, user_data, nullptr, 0, external_entities); } } context; static int xml_push_parse( const cblc_field_t *input_field, size_t input_offset, size_t len, cblc_field_t *encoding __attribute__ ((unused)), cblc_field_t *validating __attribute__ ((unused)), int returns_national __attribute__ ((unused)), void (*callback)(void) ) { ::cobol_callback = callback; context.push( input_field, input_offset, len, false); #if LIBXML_VERSION >= 21400 const xmlChar * version = xmlCtxtGetVersion( context.ctxt ); #else const xmlChar * version = xmlchar_of("requires version 2.14"); #endif assert(version); assert(nullptr == "function not ready and not called"); return 0; } extern "C" // Parser calls via parser_xml_parse_end, probabably. 
int __gg__xml_parse_done() { context.done(); return 0; } extern "C" int __gg__xml_parse( const cblc_field_t *input_field, size_t input_offset, size_t len, cblc_field_t *encoding __attribute__ ((unused)), cblc_field_t *validating __attribute__ ((unused)), int returns_national __attribute__ ((unused)), void (*callback)(void) ) { initialize_handlers(callback); const char *input = PTRCAST(char, input_field->data + input_offset); int erc = xmlSAXUserParseMemory(&handlers, nullptr, input, len); if( erc ) { const xmlError *msg = xmlCtxtGetLastError(nullptr); fprintf(stderr, "XML PARSE: error: line %d: %s (%d: %d.%d.%d)\n", msg->line, msg->message, erc, msg->domain, msg->level, msg->code); } return erc; }