From: Robert Dubner Date: Thu, 26 Feb 2026 19:42:51 +0000 (-0500) Subject: cobol: Fix FUNCTION TRIM. X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=dc2f983e5261fc27319f730fc2c9e01faac206ca;p=thirdparty%2Fgcc.git cobol: Fix FUNCTION TRIM. The FUNCTION TRIM now works properly with UTF16 inputs. According to the ISO specification, the return type of a number of intrinsic functions is defined by the variable type of their first parameter. A number of changes here cause more functions to honor that requirement. gcc/cobol/ChangeLog: * parse.y: BASECONVERT and TRIM take their type from their first parameter. * parse_util.h (intrinsic_return_field): The function_descrs[] is adjusted so that a number of functions take their return type from their first calling parameter. intrinsic_return_field() has been refined. * symbols.cc (new_alphanumeric): Use set_explicit() instead of set() in support of refined intrinsic function return type. libgcobol/ChangeLog: * intrinsic.cc (__gg__trim): Rewritten to work properly, and avoid unnecessary variable codeset encoding translation. --- diff --git a/gcc/cobol/parse.y b/gcc/cobol/parse.y index 3ab0daa4c18..df7f29f9ce8 100644 --- a/gcc/cobol/parse.y +++ b/gcc/cobol/parse.y @@ -10892,7 +10892,7 @@ intrinsic: function_udf | BASECONVERT '(' varg[r1] varg[r2] varg[r3] ')' { location_set(@1); - $$ = new_alphanumeric("BASECONVERT"); + $$ = new_alphanumeric("BASECONVERT", $r1->field->codeset.encoding); cbl_unimplemented("BASECONVERT"); if( ! intrinsic_call_3($$, BASECONVERT, $r1, $r2, $r3 )) YYERROR; } @@ -11223,7 +11223,7 @@ intrinsic: function_udf YYERROR; break; } - $$ = new_alphanumeric("TRIM"); + $$ = new_alphanumeric("TRIM", $r1->field->codeset.encoding); cbl_refer_t * how = new_reference($trim_trailing); if( ! intrinsic_call_2($$, TRIM, $r1, how) ) YYERROR; } diff --git a/gcc/cobol/parse_util.h b/gcc/cobol/parse_util.h index e3bcc79a68f..0537c60b5fd 100644 --- a/gcc/cobol/parse_util.h +++ b/gcc/cobol/parse_util.h @@ -46,15 +46,22 @@ * n variadic * We use just A, I, N, or X, choosing the most general for each parameter. * - * When FldInvalid is shown as the return type, it indicates that the type - * of the function is determined by the type of the first parameter. + * When FldInvalid is shown as the return type, it indicates that the Integer + * vs. Numeric type of the function is determined by the type of the first + * parameter. + * + * FldGroup is used when the first argument determines the encoding of the + * temporary. This is for functions that can be Alphanumeric or National. * * We use FldNumericBin5 for functions of type "Integer", and FldFloat for * functions of type "Numeric", */ + #define IntOrNum FldInvalid + #define AnumOrNat FldGroup + static const function_descr_t function_descrs[] = { { ABS, "ABS", - "__gg__abs", "N", {}, FldInvalid }, + "__gg__abs", "N", {}, IntOrNum }, { ACOS, "ACOS", "__gg__acos", "N", {}, FldFloat }, { ANNUITY, "ANNUITY", @@ -63,7 +70,7 @@ static const function_descr_t function_descrs[] = { "__gg__asin", "N", {}, FldFloat }, { ATAN, "ATAN", "__gg__atan", "N", {}, FldFloat }, - { BASECONVERT, "BASECONVERT", + { BASECONVERT, "BASECONVERT", // See parse.y "__gg__baseconvert", "XII", {}, FldAlphanumeric }, { BIT_OF, "BIT-OF", "__gg__bit_of", "X", {}, FldAlphanumeric }, @@ -81,9 +88,9 @@ static const function_descr_t function_descrs[] = { { COMBINED_DATETIME, "COMBINED-DATETIME", "__gg__combined_datetime", "IN", {}, FldFloat }, { CONCAT, "CONCAT", - "__gg__concat", "n", {}, FldAlphanumeric }, + "__gg__concat", "n", {}, AnumOrNat }, { CONVERT, "CONVERT", - "__gg__convert", "XII", {}, FldAlphanumeric }, + "__gg__convert", "XII", {}, AnumOrNat }, { COS, "COS", "__gg__cos", "N", {}, FldFloat }, { CURRENT_DATE, "CURRENT-DATE", @@ -121,13 +128,13 @@ static const function_descr_t function_descrs[] = { { FIND_STRING, "FIND-STRING", "__gg__find_string", "AXI", {}, FldNumericBin5 }, { FORMATTED_CURRENT_DATE, "FORMATTED-CURRENT-DATE", - "__gg__formatted_current_date", "X", {}, FldAlphanumeric }, + "__gg__formatted_current_date", "X", {}, AnumOrNat }, { FORMATTED_DATE, "FORMATTED-DATE", - "__gg__formatted_date", "XX", {}, FldAlphanumeric }, + "__gg__formatted_date", "XX", {}, AnumOrNat }, { FORMATTED_DATETIME, "FORMATTED-DATETIME", - "__gg__formatted_datetime", "XINI", {}, FldAlphanumeric }, + "__gg__formatted_datetime", "XINI", {}, AnumOrNat }, { FORMATTED_TIME, "FORMATTED-TIME", - "__gg__formatted_time", "INI", {}, FldAlphanumeric }, + "__gg__formatted_time", "INI", {}, AnumOrNat }, { FRACTION_PART, "FRACTION-PART", "__gg__fraction_part", "N", {}, FldFloat }, { HEX_OF, "HEX-OF", @@ -135,7 +142,7 @@ static const function_descr_t function_descrs[] = { { HEX_TO_CHAR, "HEX-TO-CHAR", "__gg__hex_to_char", "X", {}, FldAlphanumeric }, { HIGHEST_ALGEBRAIC, "HIGHEST-ALGEBRAIC", - "__gg__highest_algebraic", "N", {}, FldInvalid }, + "__gg__highest_algebraic", "N", {}, IntOrNum }, { INTEGER, "INTEGER", "__gg__integer", "N", {}, FldNumericBin5 }, // requires FldBoolean @@ -164,11 +171,11 @@ static const function_descr_t function_descrs[] = { { LOG10, "LOG10", "__gg__log10", "N", {}, FldFloat }, { LOWER_CASE, "LOWER-CASE", - "__gg__lower_case", "X", {}, FldAlphanumeric }, + "__gg__lower_case", "X", {}, AnumOrNat }, { LOWEST_ALGEBRAIC, "LOWEST-ALGEBRAIC", - "__gg__lowest_algebraic", "N", {}, FldInvalid }, + "__gg__lowest_algebraic", "N", {}, IntOrNum }, { MAXX, "MAX", - "__gg__max", "n", {}, FldInvalid }, + "__gg__max", "n", {}, IntOrNum }, { MEAN, "MEAN", "__gg__mean", "n", {}, FldFloat }, { MEDIAN, "MEDIAN", @@ -176,7 +183,7 @@ static const function_descr_t function_descrs[] = { { MIDRANGE, "MIDRANGE", "__gg__midrange", "n", {}, FldFloat }, { MINN, "MIN", - "__gg__min", "n", {}, FldInvalid }, + "__gg__min", "n", {}, IntOrNum }, { MOD, "MOD", "__gg__mod", "IN", {}, FldNumericBin5 }, { MODULE_NAME, "MODULE-NAME", @@ -202,11 +209,11 @@ static const function_descr_t function_descrs[] = { { RANDOM, "RANDOM", "__gg__random", "I", {}, FldFloat }, { RANGE, "RANGE", - "__gg__range", "n", {}, FldInvalid }, + "__gg__range", "n", {}, IntOrNum }, { REM, "REM", "__gg__rem", "NN", {}, FldFloat }, { REVERSE, "REVERSE", - "__gg__reverse", "X", {}, FldAlphanumeric }, + "__gg__reverse", "X", {}, AnumOrNat }, { SECONDS_FROM_FORMATTED_TIME, "SECONDS-FROM-FORMATTED-TIME", "__gg__seconds_from_formatted_time", "XX", {}, FldFloat }, { SECONDS_PAST_MIDNIGHT, "SECONDS_PAST_MIDNIGHT", @@ -216,7 +223,7 @@ static const function_descr_t function_descrs[] = { { SIN, "SIN", "__gg__sin", "N", {}, FldFloat }, { SMALLEST_ALGEBRAIC, "SMALLEST-ALGEBRAIC", - "__gg__smallest_algebraic", "N", {}, FldInvalid }, + "__gg__smallest_algebraic", "N", {}, IntOrNum }, { SQRT, "SQRT", "__gg__sqrt", "N", {}, FldFloat }, { STANDARD_COMPARE, "STANDARD-COMPARE", @@ -224,9 +231,9 @@ static const function_descr_t function_descrs[] = { { STANDARD_DEVIATION, "STANDARD-DEVIATION", "__gg__standard_deviation", "n", {}, FldFloat }, { SUBSTITUTE, "SUBSTITUTE", - "__gg__substitute", "XXX", {}, FldAlphanumeric }, + "__gg__substitute", "XXX", {}, AnumOrNat }, { SUM, "SUM", - "__gg__sum", "n", {}, FldInvalid }, + "__gg__sum", "n", {}, IntOrNum }, { TAN, "TAN", "__gg__tan", "N", {}, FldFloat }, { TEST_DATE_YYYYMMDD, "TEST-DATE-YYYYMMDD", @@ -241,8 +248,8 @@ static const function_descr_t function_descrs[] = { "__gg__test_numval_c", "XXU", {}, FldNumericBin5 }, { TEST_NUMVAL_F, "TEST-NUMVAL-F", "__gg__test_numval_f", "X", {}, FldNumericBin5 }, - { TRIM, "TRIM", - "__gg__trim", "XI", {}, FldNumericBin5 }, + { TRIM, "TRIM", // See parse.y + "__gg__trim", "XI", {}, FldAlphanumeric }, { ULENGTH, "ULENGTH", "__gg__ulength", "X", {}, FldAlphanumeric }, { UPOS, "UPOS", @@ -342,7 +349,8 @@ intrinsic_return_field(int token, std::vector args) retval = new_tempnumeric_float(); break; case FldInvalid: - // This is a flag that a function takes the type of its first input + // This is a flag that a function takes the Numeric vs Int type of its + // first argument assert( args.size() ); switch(args[0].field->type) { @@ -350,7 +358,7 @@ intrinsic_return_field(int token, std::vector args) case FldAlphanumeric: case FldAlphaEdited: case FldLiteralA: - retval = new_alphanumeric(); + retval = new_alphanumeric(NULL, args[0].field->codeset.encoding); break; case FldNumericBinary: case FldPacked: @@ -370,6 +378,36 @@ intrinsic_return_field(int token, std::vector args) break; } break; + + case FldGroup: + // This is a flag that an alphanumeric function takes the encoding of the + // first argument + assert( args.size() ); + switch(args[0].field->type) + { + case FldGroup: + case FldAlphanumeric: + case FldAlphaEdited: + case FldLiteralA: + case FldNumericBinary: + case FldPacked: + case FldNumericDisplay: + case FldNumericBin5: + case FldLiteralN: + case FldIndex: + case FldPointer: + retval = new_alphanumeric(NULL, args[0].field->codeset.encoding); + break; + case FldFloat: + retval = new_tempnumeric_float(); + break; + default: + retval = NULL; + gcc_unreachable(); + break; + } + break; + default: retval = NULL; gcc_unreachable(); diff --git a/gcc/cobol/symbols.cc b/gcc/cobol/symbols.cc index 99d638e8c16..55c40ffa5ca 100644 --- a/gcc/cobol/symbols.cc +++ b/gcc/cobol/symbols.cc @@ -3783,14 +3783,22 @@ symbol_temporaries_free() { cbl_field_t * new_alphanumeric( const cbl_name_t name, cbl_encoding_t encoding ) { cbl_field_t * field = new_temporary_impl(FldAlphanumeric, name); +//// if( encoding != no_encoding_e ) { +//// field->codeset.set(encoding); +//// } +//// //// Dubner hacking away: If name is non-null, then assume this is a +//// //// function definition, and force the codeset, which otherwise will have +//// //// defaulted to current_encoding('A'), and the valid() test in codeset.set +//// //// will have prevented it from being changed. +//// if( name && encoding != no_encoding_e ) { +//// field->codeset.set_explicit(encoding); +//// } + /* Jim's original code was hedged with protections apparently intended to + prevent encodings from changing. This proved unsatisfactor, especially + when I started implementing setting the temporary return type of functions + that take on the characteristics of their first parameter. So, I went + from codeset.set_encoding() to codeset.set_explicit(). */ if( encoding != no_encoding_e ) { - field->codeset.set(encoding); - } - //// Dubner hacking away: If name is non-null, then assume this is a - //// function definition, and force the codeset, which otherwise will have - //// defaulted to current_encoding('A'), and the valid() test in codeset.set - //// will have prevented it from being changed. - if( name && encoding != no_encoding_e ) { field->codeset.set_explicit(encoding); } temporaries.add(field); diff --git a/libgcobol/intrinsic.cc b/libgcobol/intrinsic.cc index d3206b89679..8c1119e1dee 100644 --- a/libgcobol/intrinsic.cc +++ b/libgcobol/intrinsic.cc @@ -3527,9 +3527,11 @@ __gg__trim( cblc_field_t *dest, size_t arg2_offset, size_t arg2_size) { - cbl_encoding_t from = arg1->encoding; - cbl_encoding_t to = dest->encoding; - charmap_t *charmap = __gg__get_charmap(to); + // We assume that dest is an intermediate_e with the same encoding as arg1. + assert( dest->type == FldAlphanumeric + && (dest->attr & intermediate_e) + && dest->encoding == arg1->encoding ); + charmap_t *charmap = __gg__get_charmap(arg1->encoding); int stride = charmap->stride(); cbl_char_t mapped_space = charmap->mapped_character(ascii_space); @@ -3539,80 +3541,61 @@ __gg__trim( cblc_field_t *dest, arg2_offset, arg2_size); //static const int BOTH = 0; - static const int LEADING = 1; // Remove leading spaces - static const int TRAILING = 2; // Remove trailing spaces - - if( dest->type != FldAlphanumeric || - !(dest->attr & intermediate_e) ) - { - fprintf(stderr, - "We expect the target of a FUNCTION TRIM to " - "be an intermediate alphanumeric\n"); - abort(); - } + #define LEADING 1 // Remove leading spaces + #define TRAILING 2 // Remove trailing spaces - // What is this all about? - dest->capacity = dest->offset; - - // Make a copy of the input: - char *copy = static_cast(malloc(arg1_size)); - massert(copy); - memcpy(copy, arg1->data+arg1_offset, arg1_size); - - // Convert it to the destination encoding - __gg__convert_encoding_length(copy, arg1_size, from, to); - - // No matter what, we want to find the leftmost non-space and the - // rightmost non-space: - - char *left = copy; - char *right = left + arg1_size-stride; - - // Find left and right: the first and last non-spaces - while( left <= right ) + char *left = reinterpret_cast(arg1->data) + arg1_offset; + char *right = left + arg1_size-stride; // Points AT the character, not beyond + switch(type) { - cbl_char_t cleft = charmap->getch(left, (size_t)0); - cbl_char_t cright = charmap->getch(right, (size_t)0); - - if( cleft != mapped_space && cright != mapped_space ) - { + case 0: // Strip off leading and trailing spaces + while(left <= right) + { + if( charmap->getch(left, (size_t)0) != mapped_space ) + { + break; + } + left += stride; + } + while(left <= right) + { + if( charmap->getch(right, (size_t)0) != mapped_space ) + { + break; + } + right -= stride; + } break; - } - if( cleft == mapped_space ) + + case LEADING: // Just leading { - left += stride; + while(left <= right) + { + if( charmap->getch(left, (size_t)0) != mapped_space ) + { + break; + } + left += stride; + } + break; } - if( cright == mapped_space ) + + case TRAILING: // Just trailing { - right -= stride; + while(left <= right) + { + if( charmap->getch(right, (size_t)0) != mapped_space ) + { + break; + } + right -= stride; + } + break; } } - if( type == LEADING ) - { - // We want to leave any trailing spaces, so we return 'right' to its - // original value: - right = copy + arg1_size-1; - } - else if( type == TRAILING ) - { - // We want to leave any leading spaces, so we return 'left' to its - // original value: - left = copy; - } - - if( left > right ) - { - // When the arg1 input string was empty, we want left to be right+1. - // The left/right loop can sometimes end up with left equal to right+2. - // That needs to be fixed: - left = right+stride; - } - size_t ncount = right+stride - left; __gg__adjust_dest_size(dest, ncount); - memmove(dest->data, left, ncount); - free(copy); } #if HAVE_INITSTATE_R && HAVE_SRANDOM_R && HAVE_RANDOM_R