From: Robert Dubner Date: Tue, 21 Oct 2025 17:33:30 +0000 (-0400) Subject: cobol: Implement the XML PARSE statement. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b20c6458fa0a9e78253052f0493e921f75641828;p=thirdparty%2Fgcc.git cobol: Implement the XML PARSE statement. These changes implement the XML PARSE statement as described in the IBM specification. A repair to exception handling is included. Up until now, an exception after a successful file operation wasn't handled properly. A repair to value declarations for BINARY / COMP / COMP-4 / COMP-5 values now allows them to have digits to the right of the implied decimal point. Processing of the "S" PICTURE character has been normalized as well. Co-Authored-By: James K. Lowden Co-Authored-By: Robert Dubner gcc/cobol/ChangeLog: * Make-lang.in: Incorporate new token_names.h file. * cdf.y: Modify tokens. * gcobol.1: Document XML PARSE statement * genapi.cc (parser_enter_program): Verify that every goto has a matching label. (parser_end_program): Likewise. (parser_alphabet): Refine handling codeset encodings. (parser_alphabet_use): Likewise. (label_fetch): Moved from later in the source code. (parser_xml_parse): New routine for XML PARSE. (parser_xml_on_exception): Likewise. (parser_xml_not_exception): Likewise. (parser_xml_end): Likewise. (parser_label_label): Verify goto/label matching. (parser_label_goto): Likewise. (parser_entry): Minor change to SHOW_PARSE report. * genapi.h (parser_alphabet): Set parameter to const. (parser_xml_parse): Declare new function. (parser_xml_on_exception): Likewise. (parser_xml_not_exception): Likewise. (parser_xml_end): Likewise. (parser_label_addr): Likewise. * parse.y: label_pair_t structure; locale processing; new token processing for alphabets and XML PARSE. * parse_ante.h (name_of): Return field->name when initial is NULL. (new_tempnumeric): Make signable_e optional. (ast_save_locale): New function. (data_division_ready): Warning for "no alphabet". * scan.l: Repair interpretation of BINARY, COMP, COMP-4, and COMP-5. * scan_ante.h (struct bint_t): Likewise. * scan_post.h (current_tokens_t::tokenset_t::tokenset_t): Include token_names.h. * symbols.cc (symbols_alphabet_set): Revert to prior alphabet determination. (symbol_table_init): New XML special registers. (new_temporary): Make signable_e controllable, not fixed. * symbols.h (__gg__encoding_iconv_valid): New declaration. (enum cbl_label_type_t): New LblXml label type. (struct cbl_xml_parse_t): (struct cbl_label_t): Implement XML PARSE. (new_temporary): Incorporate boolean for signable_e. (symbol_elem_of): Change label field type handling. (cbl_section_of): Likewise. (cbl_field_of): Likewise. (cbl_label_of): Likewise. (cbl_special_name_of): Likewise. (cbl_alphabet_of): Likewise. (cbl_file_of): Likewise. * token_names.h: New file. * util.cc (gcc_location_set_impl): Improve location_t calculations when entering and leaving COPYBOOKs. libgcobol/ChangeLog: * Makefile.am: Changes for XML PARSE and POSIX functions. * Makefile.in: Likewise. * charmaps.cc: Augment encodings[] table with "supported" boolean. (__gg__encoding_iconv_name): Modify how encodings are identified. (encoding_descr): Likewise. (__gg__encoding_iconv_valid): Likewise. * common-defs.h (callback_t): Define function pointer. * constants.cc: Use named cbl_attr_e constants instead of magic numbers.; New definitions for XML special registers. * encodings.h (struct encodings_t): Declare "supported" boolean. * libgcobol.cc (format_for_display_internal): Use std::ptrdiff_t. (__gg__alphabet_use): Add case for iconv_CP1252_e. (default_exception_handler): Repair exception handling after a successful file operation. * posix/errno.cc: New file. * posix/localtime.cc: New file. * posix/stat.cc: New file. * posix/stat.h: New file. * posix/tm.h: New file. * xmlparse.cc: New file to support XML PARSE statement. gcc/testsuite/ChangeLog: * cobol.dg/typo-1.cob: New test for squiggles and carets. --- diff --git a/gcc/cobol/Make-lang.in b/gcc/cobol/Make-lang.in index 0e2a773d4df..1f9995febf1 100644 --- a/gcc/cobol/Make-lang.in +++ b/gcc/cobol/Make-lang.in @@ -225,6 +225,7 @@ cobol/scan.o: cobol/scan.cc \ $(srcdir)/cobol/scan_post.h \ $(srcdir)/cobol/symbols.h \ $(srcdir)/cobol/util.h \ + $(srcdir)/cobol/token_names.h \ $(srcdir)/hwint.h \ $(srcdir)/system.h \ $(srcdir)/../include/ansidecl.h \ @@ -241,6 +242,15 @@ cobol/scan.o: cobol/scan.cc \ cobol/cdf.cc \ cobol/parse.cc +# Update token names if the generator script is installed +# (by a developer) and there's been a change. +$(srcdir)/cobol/token_names.h: cobol/parse.cc + if [ -f $@.gen ]; then \ + $@.gen $(subst .cc,.h,$^) \ + | diff -u $@ - \ + | patch -t --set-time $@ ; \ + fi + # # The src targets are executed if # ‘--enable-generated-files-in-srcdir’ was specified as a configure diff --git a/gcc/cobol/cdf.y b/gcc/cobol/cdf.y index f01c8f6848a..f72ed77c964 100644 --- a/gcc/cobol/cdf.y +++ b/gcc/cobol/cdf.y @@ -244,21 +244,21 @@ apply_cdf_turn( const exception_turn_t& turn ) { %type DEFINED %token OTHER 699 PARAMETER_kw 369 "PARAMETER" %token OFF 688 OVERRIDE 370 -%token THRU 952 -%token TRUE_kw 815 "True" +%token THRU 949 +%token TRUE_kw 814 "True" %token CALL_COBOL 393 "CALL" %token CALL_VERBATIM 394 "CALL (as C)" -%token TURN 817 CHECKING 497 LOCATION 650 ON 690 WITH 844 +%token TURN 816 CHECKING 497 LOCATION 650 ON 690 WITH 843 -%left OR 953 -%left AND 954 -%right NOT 955 -%left '<' '>' '=' NE 956 LE 957 GE 958 +%left OR 950 +%left AND 951 +%right NOT 952 +%left '<' '>' '=' NE 953 LE 954 GE 955 %left '-' '+' %left '*' '/' -%right NEG 960 +%right NEG 957 %define api.prefix {ydf} %define api.token.prefix{YDF_} diff --git a/gcc/cobol/gcobol.1 b/gcc/cobol/gcobol.1 index 9ea9bfd1478..0de86dff623 100644 --- a/gcc/cobol/gcobol.1 +++ b/gcc/cobol/gcobol.1 @@ -778,6 +778,30 @@ resolution of .Ar filename is deferred until runtime, when the name must appear in the program's environment. +.Ss XML PARSE +.Nm +emulates the IBM +.Sy "XML PARSE" +statement. The following values for +.Sy XML-EVENT +are defined: +.Bl -tag -compact +.It Sy COMMENT +Text of a comment between "" +.It Sy CONTENT-CHARACTERS +Some or all of the character content of the element between start and end tags. +.It Sy END-OF-ELEMENT +End-element tag, with name if present in the input. +.It Sy PROCESSING-INSTRUCTION-DATA +Processing instruction (after the target name), excluding "?>". +.It Sy PROCESSING-INSTRUCTION-TARGET +The processing instruction target name appears in +.Sy XML-TEXT +or +.Sy XML-NTEXT . +.It Sy START-OF-ELEMENT +Name of the start element tag or empty element tag. +.El . .Sh ISO \*[lang] Implementation Status .Ss USAGE Data Types @@ -1480,6 +1504,18 @@ error. This feature is meant to help diagnose mysterious copybook errors. .El . +.Ss Variables for Developers +.Bl -tag -compact +.It Ev GCOBOL_SHOW +produces a trace of the internal calls made by the parser to prepare +the GENERIC tree. +.It Ev GCOBOL_TRACE +used at compile time, produces an executable that traces the +execution, mapping it back the same code-creation functions as +.Ev GCOBOL_SHOW , +as well as the values of data items and branch conditions. +.El +. .Sh FILES Executables produced by .Nm diff --git a/gcc/cobol/genapi.cc b/gcc/cobol/genapi.cc index 6fc4770ca26..4a880c3864b 100644 --- a/gcc/cobol/genapi.cc +++ b/gcc/cobol/genapi.cc @@ -3988,6 +3988,37 @@ parser_enter_program( const char *funcname_, free(funcname); } +static class label_verify_t { + std::set lain, dangling; + static inline size_t index_of( const cbl_label_t *label ) { + return symbol_index(symbol_elem_of(label)); + } +public: + void go_to( const cbl_label_t *label ) { + auto p = lain.find(index_of(label)); + if( p == lain.end() ) { + dangling.insert(index_of(label)); + } + } + bool lay( const cbl_label_t *label ) { + auto ok = lain.insert(index_of(label)); + if( ok.second ) { + dangling.erase(index_of(label)); + } + return true; + } + bool vet() const { // be always agreeable, for now. + return dangling.empty(); + } + void dump() const { + fprintf(stderr, "%u nonexistent labels called\n", unsigned(dangling.size()) ); + for( auto sym : dangling ) { + auto label = cbl_label_of(symbol_at(sym)); + fprintf(stderr, "\t %s\n", label->name); + } + } +} label_verify; + void parser_end_program(const char *prog_name ) { @@ -4014,6 +4045,13 @@ parser_end_program(const char *prog_name ) TRACE1_END } + if( ! label_verify.vet() ) + { + label_verify.dump(); + gcc_unreachable(); + } + + if( gg_trans_unit.function_stack.size() ) { // The body has been created by various parser calls. It's time @@ -5035,7 +5073,7 @@ parser_accept_date_hhmmssff( struct cbl_field_t *target ) */ void -parser_alphabet( cbl_alphabet_t& alphabet ) +parser_alphabet( const cbl_alphabet_t& alphabet ) { Analyze(); SHOW_PARSE @@ -5046,6 +5084,9 @@ parser_alphabet( cbl_alphabet_t& alphabet ) free(psz); switch(alphabet.encoding) { + case iconv_CP1252_e: + psz = xasprintf("CP1252"); + break; case ASCII_e: psz = xasprintf("ASCII"); break; @@ -5074,6 +5115,7 @@ parser_alphabet( cbl_alphabet_t& alphabet ) switch(alphabet.encoding) { + case iconv_CP1252_e: case ASCII_e: case iso646_e: case EBCDIC_e: @@ -5114,6 +5156,9 @@ parser_alphabet( cbl_alphabet_t& alphabet ) break; } default: + fprintf(stderr, "%s: Program ID %s:\n", + cobol_filename(), + cbl_label_of(symbol_at(current_program_index()))->name); gcc_unreachable(); } } @@ -5130,6 +5175,9 @@ parser_alphabet_use( cbl_alphabet_t& alphabet ) free(psz); switch(alphabet.encoding) { + case iconv_CP1252_e: + psz = xasprintf("CP1252"); + break; case ASCII_e: psz = xasprintf("ASCII"); break; @@ -5159,6 +5207,7 @@ parser_alphabet_use( cbl_alphabet_t& alphabet ) { default: gcc_unreachable(); + case iconv_CP1252_e: case ASCII_e: case iso646_e: case EBCDIC_e: @@ -6802,6 +6851,160 @@ parser_free( size_t n, cbl_refer_t refers[] ) } } +static +cbl_label_addresses_t * +label_fetch(struct cbl_label_t *label) + { + if( !label->structs.goto_trees ) + { + label->structs.goto_trees + = static_cast + (xmalloc(sizeof(struct cbl_label_addresses_t))); + gcc_assert(label->structs.goto_trees); + + gg_create_goto_pair(&label->structs.goto_trees->go_to, + &label->structs.goto_trees->label); + } + return label->structs.goto_trees; + } + +void +parser_xml_parse( cbl_label_t *instance, + cbl_refer_t input, + cbl_field_t *encoding, + cbl_field_t *validating, + bool returns_national, + cbl_label_t *from_proc, + cbl_label_t *to_proc ) + { + SHOW_PARSE + { + SHOW_PARSE_HEADER + SHOW_PARSE_LABEL("", instance) + SHOW_PARSE_REF(" ", input) + SHOW_PARSE_END + } + + TRACE1 + { + TRACE1_HEADER + TRACE1_END + } + + // We know that this routine comes first in the sequence, so we can + // create the goto/label pairs here: + + instance->structs.xml_parse = static_cast + (xmalloc(sizeof(struct cbl_xml_parse_t))); + gcc_assert(instance->structs.xml_parse); + + gg_create_goto_pair(&instance->structs.xml_parse->over.go_to, + &instance->structs.xml_parse->over.label); + gg_create_goto_pair(&instance->structs.xml_parse->exception.go_to, + &instance->structs.xml_parse->exception.label); + gg_create_goto_pair(&instance->structs.xml_parse->no_exception.go_to, + &instance->structs.xml_parse->no_exception.label); + + // We need to create a COBOL ENTRY point into this function. That entry + // point will be used by __gg__xml_parse to perform from_proc through to_proc + // as part of processing the libxml2 callbacks. + + char ach[64]; + static int instance_counter = 1; + sprintf(ach, + "_%s_xml_callback_%d", + current_function->our_name, + instance_counter++); + + cbl_field_t for_entry = {}; + for_entry.type = FldAlphanumeric; + for_entry.data.capacity = strlen(ach); + for_entry.data.initial = ach; + for_entry.codeset.encoding = iconv_CP1252_e; + + // build an island for the callback: + tree island_goto; + tree island_label; + gg_create_goto_pair(&island_goto, + &island_label); + + gg_append_statement(island_goto); + // This creates the separate _xml_callback function + parser_entry(&for_entry, 0, nullptr); + // When invoked, the callback performs the processing procedures + parser_perform(from_proc, to_proc); + // And then returns back to the caller + gg_return(0); + gg_append_statement(island_label); + + // With the callback in place, we are ready to call the library: + tree pcallback = gg_get_function_address(VOID, ach); + + tree erc = gg_define_int(); + gg_assign(erc, gg_call_expr(INT, + "__gg__xml_parse", + gg_get_address_of(input.field->var_decl_node), + refer_offset(input), + refer_size_source(input), + encoding ? + gg_get_address_of(encoding->var_decl_node) + : null_pointer_node, + validating ? + gg_get_address_of(validating->var_decl_node) + : null_pointer_node, + build_int_cst_type(INT, returns_national), + pcallback, + NULL_TREE)); + IF( erc, ne_op, integer_zero_node ) + { + //gg_printf("__gg__xml_parse() failed with erc %d\n", erc, NULL_TREE); + gg_append_statement(instance->structs.xml_parse->exception.go_to); + } + ELSE + { + //gg_printf("__gg__xml_parse() apparently succeeded\n", NULL_TREE); + gg_append_statement(instance->structs.xml_parse->no_exception.go_to); + } + ENDIF + } + +void +parser_xml_on_exception( cbl_label_t *instance ) + { + SHOW_PARSE + { + SHOW_PARSE_HEADER + SHOW_PARSE_LABEL(" ", instance) + SHOW_PARSE_END + } + gg_append_statement(instance->structs.xml_parse->over.go_to); + gg_append_statement(instance->structs.xml_parse->exception.label); + } + +void +parser_xml_not_exception( cbl_label_t *instance ) +{ + SHOW_PARSE + { + SHOW_PARSE_HEADER + SHOW_PARSE_LABEL(" ", instance) + SHOW_PARSE_END + } + gg_append_statement(instance->structs.xml_parse->over.go_to); + gg_append_statement(instance->structs.xml_parse->no_exception.label); + } + +void parser_xml_end( cbl_label_t *instance ) + { + SHOW_PARSE + { + SHOW_PARSE_HEADER + SHOW_PARSE_LABEL(" ", instance) + SHOW_PARSE_END + } + gg_append_statement(instance->structs.xml_parse->over.label); + } + void parser_arith_error(cbl_label_t *arithmetic_label) { @@ -7962,23 +8165,6 @@ parser_see_stop_run(struct cbl_refer_t exit_status, gg_exit(returned_value); } -static -cbl_label_addresses_t * -label_fetch(struct cbl_label_t *label) - { - if( !label->structs.goto_trees ) - { - label->structs.goto_trees - = static_cast - (xmalloc(sizeof(struct cbl_label_addresses_t))); - gcc_assert(label->structs.goto_trees); - - gg_create_goto_pair(&label->structs.goto_trees->go_to, - &label->structs.goto_trees->label); - } - return label->structs.goto_trees; - } - void parser_label_label(struct cbl_label_t *label) { @@ -8009,6 +8195,12 @@ parser_label_label(struct cbl_label_t *label) } CHECK_LABEL(label); + + if( ! label_verify.lay(label) ) + { + yywarn("%s: label %qs already exists", __func__, label->name); + gcc_unreachable(); + } if(strcmp(label->name, "_end_declaratives") == 0 ) { @@ -8048,6 +8240,8 @@ parser_label_goto(struct cbl_label_t *label) } CHECK_LABEL(label); + + label_verify.go_to(label); if( strcmp(label->name, "_end_declaratives") == 0 ) { @@ -13525,7 +13719,8 @@ parser_entry( const cbl_field_t *name, size_t nusing, cbl_ffi_arg_t *args ) SHOW_PARSE { SHOW_PARSE_HEADER - SHOW_PARSE_FIELD( " ENTRY ", name) + SHOW_PARSE_TEXT(" ") + SHOW_PARSE_TEXT(name->data.initial) SHOW_PARSE_END } diff --git a/gcc/cobol/genapi.h b/gcc/cobol/genapi.h index 1aafc651a2b..6582d2e8898 100644 --- a/gcc/cobol/genapi.h +++ b/gcc/cobol/genapi.h @@ -81,7 +81,7 @@ void parser_accept_date_dow( cbl_field_t *tgt ); void parser_accept_date_hhmmssff( cbl_field_t *tgt ); void -parser_alphabet( cbl_alphabet_t& alphabet ); +parser_alphabet( const cbl_alphabet_t& alphabet ); void parser_alphabet_use( cbl_alphabet_t& alphabet ); @@ -90,6 +90,18 @@ parser_allocate( cbl_refer_t size_or_based, cbl_refer_t returning, bool initiali void parser_free( size_t n, cbl_refer_t refers[] ); +void parser_xml_parse( cbl_label_t *stmt, + cbl_refer_t input, + cbl_field_t *encoding, + cbl_field_t *validating, + bool returns_national, + cbl_label_t *from_proc, + cbl_label_t *to_proc ); + +void parser_xml_on_exception( cbl_label_t *name ); +void parser_xml_not_exception( cbl_label_t *name ); +void parser_xml_end( cbl_label_t *name ); + void parser_add( size_t nC, cbl_num_result_t *C, size_t nA, cbl_refer_t *A, @@ -322,6 +334,9 @@ parser_label_label( struct cbl_label_t *label ); void parser_label_goto( struct cbl_label_t *label ); +callback_t * +parser_label_addr( struct cbl_label_t *label ); + void parser_goto( cbl_refer_t value, size_t narg, cbl_label_t * const labels[] ); diff --git a/gcc/cobol/parse.y b/gcc/cobol/parse.y index c497b8f12f7..d0e0c3f582a 100644 --- a/gcc/cobol/parse.y +++ b/gcc/cobol/parse.y @@ -55,6 +55,41 @@ const char *alpha, *national; }; + struct label_pair_t { + cbl_label_t *from, *to; + }; + +class locale_tgt_t { + char user_system_default; + std::vector categories; + public: + locale_tgt_t() : user_system_default('\0') {} + locale_tgt_t( int category ) + : user_system_default('\0') + , categories(1, category) + {} + locale_tgt_t operator=( int ch ) { + assert(categories.empty()); + switch(ch) { + case 'S': case 'U': + user_system_default = ch; + return *this; + } + gcc_unreachable(); + } + locale_tgt_t push_back( int token ) { + categories.push_back(token); + return *this; + } + + bool is_default() const { return 0 < user_system_default; } + char default_of() const { + assert(categories.empty()); + return user_system_default; + } + const std::vector& lc_categories() const { return categories; } +}; + class literal_t { size_t isym; public: @@ -65,9 +100,7 @@ bool empty() const { return data == NULL; } size_t isymbol() const { return isym; } - const char * symbol_name() const { - return isym? cbl_field_of(symbol_at(isym))->name : ""; - } + const char * symbol_name() const; literal_t& set( size_t len, char *data, const char prefix[] ) { @@ -76,17 +109,8 @@ return *this; } - literal_t& - set( const cbl_field_t * field ) { - assert(field->has_attr(constant_e)); - assert(is_literal(field)); - - set_prefix( "", 0 ); - set_data( field->data.capacity, - const_cast(field->data.initial), - field_index(field) ); - return *this; - } + literal_t& set( const cbl_field_t * field ); + literal_t& set_data( size_t len, char *data, size_t isym = 0 ) { this->isym = isym; @@ -99,36 +123,8 @@ } return *this; } - literal_t& - set_prefix( const char *input, size_t len ) { - encoding = current_encoding('A'); - assert(len < sizeof(prefix)); - std::fill(prefix, prefix + sizeof(prefix), '\0'); - std::transform(input, input + len, prefix, toupper); - switch(prefix[0]) { - case '\0': case 'Z': - encoding = current_encoding('A'); - break; - case 'N': - encoding = current_encoding('N'); - if( 'X' == prefix[1] ) { - cbl_unimplemented("NX literals"); - } - break; - case 'G': - cbl_unimplemented("DBCS encoding not supported"); - break; - case 'U': - encoding = UTF8_e; - break; - case 'X': - break; - default: - gcc_unreachable(); - } - assert(encoding <= iconv_YU_e); - return *this; - } + literal_t& set_prefix( const char *input, size_t len ); + bool compatible_prefix( const literal_t& that ) const { if( prefix[0] != that.prefix[0] ) { @@ -456,7 +452,7 @@ CF CH CHANGED CHAR CHAR_NATIONAL "CHAR-NATIONAL" CHARACTER CHARACTERS CHECKING CLASS - COBOL CODE CODESET COLLATING + COBOL CODE CODESET "CODE-SET" COLLATING COLUMN COMBINED_DATETIME "COMBINED-DATETIME" COMMA COMMAND_LINE "COMMAND-LINE" COMMAND_LINE_COUNT "COMMAND-LINE-COUNT" @@ -524,7 +520,7 @@ INTEGER_OF_DAY "INTEGER-OF-DAY" INTEGER_OF_FORMATTED_DATE "INTEGER-OF-FORMATTED-DATE" INTEGER_PART "INTEGER-PART" - INTO INTRINSIC INVOKE IO IO_CONTROL "IO-CONTROL" + INTO INTRINSIC INVOKE IO "I-O" IO_CONTROL "I-O-CONTROL" IS ISNT "IS NOT" KANJI KEY @@ -600,7 +596,7 @@ STATUS STRONG SUBSTITUTE SUM SYMBOL SYMBOLIC SYNCHRONIZED - TALLY TALLYING TAN TERMINATE TEST + TALLYING TAN TERMINATE TEST TEST_DATE_YYYYMMDD "TEST-DATE-YYYYMMDD" TEST_DAY_YYYYDDD "TEST-DAY-YYYYDDD" TEST_FORMATTED_DATETIME "TEST-FORMATTED-DATETIME" @@ -663,6 +659,8 @@ UNDERLINE UNSIGNED_kw UTF_16 "UTF-16" UTF_8 "UTF-8" + XMLGENERATE "XML GENERATE" + XMLPARSE "XML PARSE" ADDRESS END_ACCEPT "END-ACCEPT" @@ -814,6 +812,7 @@ %type on_overflow on_overflows %type arith_err arith_errs %type accept_except accept_excepts call_except call_excepts + %type compute_body %type ffi_name set_operand set_tgt scalar_arg unstring_src @@ -837,6 +836,12 @@ %type mistake globally first_last %type io_mode +%type xmlprocs +%type xmlexcept xmlexcepts +%type xmlencoding xmlvalidating +%type xmlreturning +%type