From: Bruno Haible Date: Mon, 29 Dec 2025 18:51:19 +0000 (+0100) Subject: New program 'msgpre'. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e549b33dfab93cd3cbaf58f8062fe29f87ee8d99;p=thirdparty%2Fgettext.git New program 'msgpre'. * gettext-tools/src/msgpre.c: New file, based on gettext-tools/src/msgfilter.c. * gettext-tools/src/FILES: Describe msgpre.c. * gettext-tools/src/Makefile.am (bin_PROGRAMS): Add msgpre. (msgpre_SOURCES, msgpre_LDADD, msgpre_DEPENDENCIES, msgpre_CPPFLAGS, msgpre_LDFLAGS): New variables. * gettext-tools/po/POTFILES.in: Add msgpre.c. * gettext-tools/man/msgpre.x: New file. * gettext-tools/man/Makefile.am (man_aux): Add msgpre.x. (man_MAN1SRC): Add msgpre.1. (man_HTML): Add msgpre.1.html. (msgpre.1, msgpre.1.html): Add dependencies. * gettext-tools/doc/gettext.texi (msgpre Invocation): New section. * gettext-tools/doc/msgpre.texi: New file. * gettext-tools/doc/Makefile.am (gettext_TEXINFOS): Add it. * gettext-tools/Makefile.am (programs-for-distdir): Create msgpre. (distdir1, gen-man1): Update for the msgpre.1 manual page. * NEWS: Mention the new program. 
--- diff --git a/.gitignore b/.gitignore index 4f68013ae..13112eac1 100644 --- a/.gitignore +++ b/.gitignore @@ -545,6 +545,8 @@ /gettext-tools/man/msginit.1.html /gettext-tools/man/msgmerge.1 /gettext-tools/man/msgmerge.1.html +/gettext-tools/man/msgpre.1 +/gettext-tools/man/msgpre.1.html /gettext-tools/man/msgunfmt.1 /gettext-tools/man/msgunfmt.1.html /gettext-tools/man/msguniq.1 @@ -790,6 +792,8 @@ autom4te.cache/ /gettext-tools/src/msginit.exe /gettext-tools/src/msgmerge /gettext-tools/src/msgmerge.exe +/gettext-tools/src/msgpre +/gettext-tools/src/msgpre.exe /gettext-tools/src/msgunfmt /gettext-tools/src/msgunfmt.exe /gettext-tools/src/msguniq diff --git a/Admin/release-steps b/Admin/release-steps index 387396e84..488c6fe97 100644 --- a/Admin/release-steps +++ b/Admin/release-steps @@ -140,6 +140,7 @@ We assume that the following environment variables are set: gettext-tools/src/msggrep.c gettext-tools/src/msginit.c gettext-tools/src/msgmerge.c + gettext-tools/src/msgpre.c gettext-tools/src/msgunfmt.c gettext-tools/src/msguniq.c gettext-tools/src/recode-sr-latin.c diff --git a/NEWS b/NEWS index 2ba5da5dd..43543a287 100644 --- a/NEWS +++ b/NEWS @@ -37,8 +37,10 @@ Version 1.0 - December 2025 an error message in this situation. * Pretranslation: - - A new program 'spit' is provided, that implements machine translation - through a locally installed Large Language Model (LLM). + - Two new programs, 'msgpre' and 'spit', are provided, that implement + machine translation through a locally installed Large Language Model + (LLM). 'msgpre' applies to an entire PO file, 'spit' to a single + message. - The documentation has a new chapter "Pretranslation". 
# Programming languages support: diff --git a/gettext-tools/Makefile.am b/gettext-tools/Makefile.am index 11214dcfc..e045161c3 100644 --- a/gettext-tools/Makefile.am +++ b/gettext-tools/Makefile.am @@ -178,6 +178,7 @@ programs-for-distdir: msgfilter$(EXEEXT) \ msggrep$(EXEEXT) \ msginit$(EXEEXT) \ + msgpre$(EXEEXT) \ msguniq$(EXEEXT) \ recode-sr-latin$(EXEEXT) @@ -200,6 +201,7 @@ distdir1: \ man/msgfilter.1 \ man/msggrep.1 \ man/msginit.1 \ + man/msgpre.1 \ man/msguniq.1 \ man/recode-sr-latin.1 man/msgcmp.1 \ @@ -216,6 +218,7 @@ man/msgcmp.1 \ man/msgfilter.1 \ man/msggrep.1 \ man/msginit.1 \ + man/msgpre.1 \ man/msguniq.1 \ man/recode-sr-latin.1 \ : gen-man1 @@ -235,6 +238,7 @@ gen-man1: programs-for-distdir \ src/msgfilter.c man/msgfilter.x \ src/msggrep.c man/msggrep.x \ src/msginit.c man/msginit.x \ + src/msgpre.c man/msgpre.x \ src/msguniq.c man/msguniq.x \ src/recode-sr-latin.c man/recode-sr-latin.x cd man \ @@ -253,6 +257,7 @@ gen-man1: programs-for-distdir \ msgfilter.1 \ msggrep.1 \ msginit.1 \ + msgpre.1 \ msguniq.1 \ recode-sr-latin.1 diff --git a/gettext-tools/doc/Makefile.am b/gettext-tools/doc/Makefile.am index fefe79226..2d5d37ca2 100644 --- a/gettext-tools/doc/Makefile.am +++ b/gettext-tools/doc/Makefile.am @@ -45,6 +45,7 @@ gettext_TEXINFOS = \ xgettext.texi \ msginit.texi \ msgmerge.texi \ + msgpre.texi \ spit.texi \ msgcat.texi \ msgconv.texi \ diff --git a/gettext-tools/doc/gettext.texi b/gettext-tools/doc/gettext.texi index fb32044c2..9b8f38829 100644 --- a/gettext-tools/doc/gettext.texi +++ b/gettext-tools/doc/gettext.texi @@ -258,6 +258,7 @@ Updating Existing PO Files Pretranslating PO Files * Installing an LLM:: Installing a Large Language Model +* msgpre Invocation:: Invoking the @code{msgpre} Program * spit Invocation:: Invoking the @code{spit} Program Editing PO Files @@ -4113,6 +4114,11 @@ If you are running @code{ollama} in a virtual machine, make the port 11434 accessible through port forwarding. 
@end itemize +@node msgpre Invocation +@section Invoking the @code{msgpre} Program + +@include msgpre.texi + @node spit Invocation @section Invoking the @code{spit} Program diff --git a/gettext-tools/doc/msgpre.texi b/gettext-tools/doc/msgpre.texi new file mode 100644 index 000000000..db45896fc --- /dev/null +++ b/gettext-tools/doc/msgpre.texi @@ -0,0 +1,245 @@ +@c This file is part of the GNU gettext manual. +@c Copyright (C) 1995-2025 Free Software Foundation, Inc. +@c See the file gettext.texi for copying conditions. + +@pindex msgpre +@cindex @code{msgpre} program, usage +@example +msgpre [@var{option}...] +@end example + +@cindex pretranslate a message catalog +@cindex translate through a Large Language Model +The @code{msgpre} program pretranslates a translation catalog. + +@strong{Warning:} The pretranslations might not be what you expect. +They might be of the wrong form, be of poor quality, or reflect some biases. + +@subsection Input file location + +@table @samp +@item -i @var{inputfile} +@itemx --input=@var{inputfile} +@opindex -i@r{, @code{msgpre} option} +@opindex --input@r{, @code{msgpre} option} +Input PO file. + +@item -D @var{directory} +@itemx --directory=@var{directory} +@opindex -D@r{, @code{msgpre} option} +@opindex --directory@r{, @code{msgpre} option} +Add @var{directory} to the list of directories. Source files are +searched relative to this list of directories. The resulting @file{.po} +file will be written relative to the current directory, though. + +@end table + +If no @var{inputfile} is given or if it is @samp{-}, standard input is read. + +@subsection Output file location + +@table @samp +@item -o @var{file} +@itemx --output-file=@var{file} +@opindex -o@r{, @code{msgpre} option} +@opindex --output-file@r{, @code{msgpre} option} +Write output to specified file. + +@end table + +The results are written to standard output if no output file is specified +or if it is @samp{-}. 
+ +@subsection Message selection + +@table @samp +@item --keep-fuzzy +@opindex --keep-fuzzy@r{, @code{msgpre} option} +Keep fuzzy messages unmodified. +Pretranslate only untranslated messages. + +@end table + +@subsection Large Language Model (LLM) options + +@table @samp +@item --species=@var{type} +@opindex --species@r{, @code{msgpre} option} +Specifies the type of Large Language Model execution engine. +The default and only valid value is @code{ollama}. + +@item --url=@var{url} +@opindex --url@r{, @code{msgpre} option} +Specifies the URL of the server that runs the Large Language Model execution engine. +For @code{ollama}, the default is @code{http://localhost:11434}. + +@item -m @var{model} +@itemx --model=@var{model} +@opindex -m@r{, @code{msgpre} option} +@opindex --model@r{, @code{msgpre} option} +Specifies the model to use. +This option is mandatory; no default exists. +The specified model must +already be installed in the Large Language Model execution engine. + +@item --prompt=@var{text} +@opindex --prompt@r{, @code{msgpre} option} +Specifies the prompt to use before each @code{msgid} from the PO file. +It allows you to specify extra instructions for the LLM. +The prompt should include an instruction like +"Translate into @var{target language}.". +Some hints for good prompts are described in the article +``How to write AI prompts for translation'' +@url{https://poeditor.com/blog/ai-prompts-for-translation/}. + +@item --postprocess=@var{command} +@opindex --postprocess@r{, @code{msgpre} option} +Specifies a command to post-process the output from the LLM. +This should be a Bourne shell command +that reads from standard input and writes to standard output. + +For instance, the @code{ministral-3:14b} model +often emphasizes part of the output with @samp{**} characters. +To eliminate these markers, +you could use the command @samp{sed -e 's/[*][*]//g'}. 
+ +@end table + +@subsection Input file syntax + +@table @samp +@item -P +@itemx --properties-input +@opindex -P@r{, @code{msgpre} option} +@opindex --properties-input@r{, @code{msgpre} option} +Assume the input file is a Java ResourceBundle in Java @code{.properties} +syntax, not in PO file syntax. + +@item --stringtable-input +@opindex --stringtable-input@r{, @code{msgpre} option} +Assume the input file is a NeXTstep/GNUstep localized resource file in +@code{.strings} syntax, not in PO file syntax. + +@end table + +@subsection Output details + +@table @samp +@item --color +@itemx --color=@var{when} +@opindex --color@r{, @code{msgpre} option} +Specify whether or when to use colors and other text attributes. +See @ref{The --color option} for details. + +@item --style=@var{style_file} +@opindex --style@r{, @code{msgpre} option} +Specify the CSS style rule file to use for @code{--color}. +See @ref{The --style option} for details. + +@item --force-po +@opindex --force-po@r{, @code{msgpre} option} +Always write an output file even if it contains no message. + +@item --indent +@opindex --indent@r{, @code{msgpre} option} +Write the .po file using indented style. + +@item --no-location +@opindex --no-location@r{, @code{msgpre} option} +Do not write @samp{#: @var{filename}:@var{line}} lines. + +@item -n +@itemx --add-location=@var{type} +@opindex --add-location@r{, @code{msgpre} option} +Generate @samp{#: @var{filename}:@var{line}} lines (default). + +The optional @var{type} can be either @samp{full}, @samp{file}, or +@samp{never}. If it is not given or @samp{full}, it generates the +lines with both file name and line number. If it is @samp{file}, the +line number part is omitted. If it is @samp{never}, it completely +suppresses the lines (same as @code{--no-location}). + +@item --strict +@opindex --strict@r{, @code{msgpre} option} +Write out a strict Uniforum conforming PO file. 
Note that this +Uniforum format should be avoided because it doesn't support the +GNU extensions. + +@item -p +@itemx --properties-output +@opindex -p@r{, @code{msgpre} option} +@opindex --properties-output@r{, @code{msgpre} option} +Write out a Java ResourceBundle in Java @code{.properties} syntax. Note +that this file format doesn't support plural forms and silently drops +obsolete messages. + +@item --stringtable-output +@opindex --stringtable-output@r{, @code{msgpre} option} +Write out a NeXTstep/GNUstep localized resource file in @code{.strings} syntax. +Note that this file format doesn't support plural forms. + +@item -w @var{number} +@itemx --width=@var{number} +@opindex -w@r{, @code{msgpre} option} +@opindex --width@r{, @code{msgpre} option} +Set the output page width. Long strings in the output files will be +split across multiple lines in order to ensure that each line's width +(= number of screen columns) is less or equal to the given @var{number}. + +@item --no-wrap +@opindex --no-wrap@r{, @code{msgpre} option} +Do not break long message lines. Message lines whose width exceeds the +output page width will not be split into several lines. Only file reference +lines which are wider than the output page width will be split. + +@item -s +@itemx --sort-output +@opindex -s@r{, @code{msgpre} option} +@opindex --sort-output@r{, @code{msgpre} option} +Generate sorted output. Note that using this option makes it much harder +for the translator to understand each message's context. + +@item -F +@itemx --sort-by-file +@opindex -F@r{, @code{msgpre} option} +@opindex --sort-by-file@r{, @code{msgpre} option} +Sort output by file location. + +@end table + +@subsection Informative output + +@table @samp +@item -h +@itemx --help +@opindex -h@r{, @code{msgpre} option} +@opindex --help@r{, @code{msgpre} option} +Display this help and exit. 
+ +@item -V +@itemx --version +@opindex -V@r{, @code{msgpre} option} +@opindex --version@r{, @code{msgpre} option} +Output version information and exit. + +@item -q +@itemx --quiet +@itemx --silent +@opindex -q@r{, @code{msgpre} option} +@opindex --quiet@r{, @code{msgpre} option} +@opindex --silent@r{, @code{msgpre} option} +Suppress progress indicators. + +@end table + +@subsection Examples + +To pretranslate the file @code{foo.po}: + +@smallexample +msgpre --model=ministral-3:14b < foo.po > foo-pretranslated.po +@end smallexample + +@noindent +Note that this command can take a long time, +depending on the model and the available hardware. diff --git a/gettext-tools/man/Makefile.am b/gettext-tools/man/Makefile.am index 9dcdd3587..35628bba0 100644 --- a/gettext-tools/man/Makefile.am +++ b/gettext-tools/man/Makefile.am @@ -27,7 +27,7 @@ EXTRA_DIST = man_aux = \ msgcmp.x msgfmt.x msgmerge.x msgunfmt.x xgettext.x \ msgattrib.x msgcat.x msgcomm.x msgconv.x msgen.x msgexec.x msgfilter.x \ -msggrep.x msginit.x msguniq.x \ +msggrep.x msginit.x msgpre.x msguniq.x \ recode-sr-latin.x \ spit.x \ gettextize.x autopoint.x @@ -37,7 +37,7 @@ gettextize.x autopoint.x man_MAN1SRC = \ msgcmp.1 msgfmt.1 msgmerge.1 msgunfmt.1 xgettext.1 \ msgattrib.1 msgcat.1 msgcomm.1 msgconv.1 msgen.1 msgexec.1 msgfilter.1 \ -msggrep.1 msginit.1 msguniq.1 \ +msggrep.1 msginit.1 msgpre.1 msguniq.1 \ recode-sr-latin.1 \ spit.1 man_MAN1WIZARD = \ @@ -50,7 +50,8 @@ man_MANS = $(man_MAN1) man_HTML = \ msgcmp.1.html msgfmt.1.html msgmerge.1.html msgunfmt.1.html xgettext.1.html \ msgattrib.1.html msgcat.1.html msgcomm.1.html msgconv.1.html msgen.1.html \ -msgexec.1.html msgfilter.1.html msggrep.1.html msginit.1.html msguniq.1.html \ +msgexec.1.html msgfilter.1.html msggrep.1.html msginit.1.html msgpre.1.html \ +msguniq.1.html \ recode-sr-latin.1.html \ spit.1.html \ gettextize.1.html autopoint.1.html @@ -120,6 +121,7 @@ msgexec.1: msgexec.x ../src/msgexec.c msgfilter.1: msgfilter.x ../src/msgfilter.c 
msggrep.1: msggrep.x ../src/msggrep.c msginit.1: msginit.x ../src/msginit.c +msgpre.1: msgpre.x ../src/msgpre.c msguniq.1: msguniq.x ../src/msguniq.c recode-sr-latin.1: recode-sr-latin.x ../src/recode-sr-latin.c spit.1: spit.x ../src/spit.c @@ -164,6 +166,7 @@ msgexec.1.html: msgexec.1 msgfilter.1.html: msgfilter.1 msggrep.1.html: msggrep.1 msginit.1.html: msginit.1 +msgpre.1.html: msgpre.1 msguniq.1.html: msguniq.1 recode-sr-latin.1.html: recode-sr-latin.1 spit.1.html: spit.1 diff --git a/gettext-tools/man/msgpre.x b/gettext-tools/man/msgpre.x new file mode 100644 index 000000000..e9fb418e6 --- /dev/null +++ b/gettext-tools/man/msgpre.x @@ -0,0 +1,4 @@ +[NAME] +msgpre \- pretranslate a message catalog +[DESCRIPTION] +.\" Add any additional description here diff --git a/gettext-tools/po/POTFILES.in b/gettext-tools/po/POTFILES.in index 7623c6491..3ec669343 100644 --- a/gettext-tools/po/POTFILES.in +++ b/gettext-tools/po/POTFILES.in @@ -67,6 +67,7 @@ src/msgl-check.c src/msgl-iconv.c src/msgl-merge.c src/msgmerge.c +src/msgpre.c src/msgunfmt.c src/msguniq.c src/open-catalog.c diff --git a/gettext-tools/src/FILES b/gettext-tools/src/FILES index dd424b007..d884545f9 100644 --- a/gettext-tools/src/FILES +++ b/gettext-tools/src/FILES @@ -199,6 +199,8 @@ msggrep.c Main source for the 'msggrep' program. | The same program, as a Python script. +-------------- The 'spit' program +msgpre.c Main source for the 'msgpre' program. + po-time.h po-time.c Create time stamps for use in PO/POT files. 
diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am index 49a60f478..044117b94 100644 --- a/gettext-tools/src/Makefile.am +++ b/gettext-tools/src/Makefile.am @@ -27,9 +27,10 @@ MAINTAINERCLEANFILES = RM = rm -f bin_PROGRAMS = \ -msgcmp msgfmt msgmerge msgunfmt xgettext \ -msgattrib msgcat msgcomm msgconv msgen msgexec msgfilter msggrep msginit msguniq \ -recode-sr-latin + msgcmp msgfmt msgmerge msgunfmt xgettext \ + msgattrib msgcat msgcomm msgconv msgen msgexec msgfilter msggrep msginit \ + msgpre msguniq \ + recode-sr-latin if BUILD_SPIT_IN_C bin_PROGRAMS += spit endif @@ -408,6 +409,7 @@ endif msginit_SOURCES = msginit.c msginit_SOURCES += msgl-merge.c msginit_SOURCES += ../../gettext-runtime/intl/localealias.c +msgpre_SOURCES = msgpre.c if !WOE32DLL msguniq_SOURCES = msguniq.c else @@ -535,6 +537,7 @@ msgexec_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) msgfilter_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) msggrep_LDADD = $(LIBGREP) libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) msginit_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) +msgpre_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) msguniq_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(WOE32_LDADD) if BUILD_SPIT_IN_C spit_LDADD = libgettextsrc.la @INTL_MACOSX_LIBS@ $(LIBJSON_C) $(LIBCURL) $(WOE32_LDADD) @@ -556,6 +559,7 @@ msgexec_DEPENDENCIES = libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_L msgfilter_DEPENDENCIES = libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_LDADD) msggrep_DEPENDENCIES = $(LIBGREP) libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_LDADD) msginit_DEPENDENCIES = libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_LDADD) +msgpre_DEPENDENCIES = libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_LDADD) msguniq_DEPENDENCIES = libgettextsrc.la ../gnulib-lib/libgettextlib.la $(WOE32_LDADD) recode_sr_latin_DEPENDENCIES = $(OTHERPROGDEPENDENCIES) if BUILD_SPIT_IN_C @@ -579,6 +583,7 
@@ msgexec_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) msgfilter_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) msggrep_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) msginit_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) +msgpre_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) msguniq_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) recode_sr_latin_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=$(bindir_c_make) if BUILD_SPIT_IN_C @@ -602,6 +607,7 @@ msgexec_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` msgfilter_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` msggrep_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` msginit_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` +msgpre_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` msguniq_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` recode_sr_latin_LDFLAGS = `$(RELOCATABLE_LDFLAGS) $(bindir)` if BUILD_SPIT_IN_C diff --git a/gettext-tools/src/msgpre.c b/gettext-tools/src/msgpre.c new file mode 100644 index 000000000..fcc1dcc4a --- /dev/null +++ b/gettext-tools/src/msgpre.c @@ -0,0 +1,795 @@ +/* Pretranslate using machine translation. + Copyright (C) 2001-2025 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2025. 
*/ + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "options.h" +#include "noreturn.h" +#include "closeout.h" +#include "dir-list.h" +#include "xvasprintf.h" +#include "error-progname.h" +#include "progname.h" +#include "relocatable.h" +#include "basename-lgpl.h" +#include "message.h" +#include "read-catalog-file.h" +#include "read-po.h" +#include "read-properties.h" +#include "read-stringtable.h" +#include "write-catalog.h" +#include "write-po.h" +#include "write-properties.h" +#include "write-stringtable.h" +#include "msgl-charset.h" +#include "xalloc.h" +#include "findprog.h" +#include "pipe-filter.h" +#include "msgl-iconv.h" +#include "xerror-handler.h" +#include "po-charset.h" +#include "c-strstr.h" +#include "propername.h" +#include "gettext.h" + +#define _(str) gettext (str) + + +/* We use the 'spit' program as a child process, and communicate through + a bidirectional pipe. */ + + +/* Force output of PO file even if empty. */ +static int force_po; + +/* Keep the fuzzy messages unmodified. */ +static int keep_fuzzy; + +/* Name of the subprogram. */ +static const char *sub_name; + +/* Pathname of the subprogram. */ +static const char *sub_path; + +/* Argument list for the subprogram. */ +static const char **sub_argv; +static int sub_argc; + +/* If true do not print unneeded messages. */ +static bool quiet; + + +/* Forward declaration of local functions. */ +_GL_NORETURN_FUNC static void usage (int status); +static void generic_filter (const char *str, size_t len, char **resultp, size_t *lengthp); +static msgdomain_list_ty *process_msgdomain_list (msgdomain_list_ty *mdlp); + + +int +main (int argc, char **argv) +{ + /* Set program name for messages. */ + set_program_name (argv[0]); + error_print_progname = maybe_print_progname; + gram_max_allowed_errors = 20; + + /* Set locale via LC_ALL. */ + setlocale (LC_ALL, ""); + + /* Set the text message domain. 
*/ + bindtextdomain (PACKAGE, relocate (LOCALEDIR)); + bindtextdomain ("gnulib", relocate (GNULIB_LOCALEDIR)); + bindtextdomain ("bison-runtime", relocate (BISON_LOCALEDIR)); + textdomain (PACKAGE); + + /* Ensure that write errors on stdout are detected. */ + atexit (close_stdout); + + /* Default values for command line options. */ + bool do_help = false; + bool do_version = false; + char *output_file = NULL; + const char *input_file = NULL; + const char *species = "ollama"; + const char *url = "http://localhost:11434"; + const char *model = NULL; + const char *to_language = NULL; + const char *prompt = NULL; + const char *postprocess = NULL; + catalog_input_format_ty input_syntax = &input_format_po; + catalog_output_format_ty output_syntax = &output_format_po; + bool sort_by_filepos = false; + bool sort_by_msgid = false; + quiet = false; + + /* Parse command line options. */ + BEGIN_ALLOW_OMITTING_FIELD_INITIALIZERS + static const struct program_option options[] = + { + { "add-location", CHAR_MAX + 'n', optional_argument }, + { NULL, 'n', no_argument }, + { "color", CHAR_MAX + 6, optional_argument }, + { "directory", 'D', required_argument }, + { "force-po", 0, no_argument, &force_po, 1 }, + { "help", 'h', no_argument }, + { "indent", CHAR_MAX + 8, no_argument }, + { "input", 'i', required_argument }, + { "keep-fuzzy", 0, no_argument, &keep_fuzzy, 1 }, + { "model", 'm', required_argument }, + { "no-location", CHAR_MAX + 9, no_argument }, + { "no-wrap", CHAR_MAX + 12, no_argument }, + { "output-file", 'o', required_argument }, + { "postprocess", CHAR_MAX + 4, required_argument }, + { "prompt", CHAR_MAX + 3, required_argument }, + { "properties-input", 'P', no_argument }, + { "properties-output", 'p', no_argument }, + { "quiet", 'q', no_argument }, + { "silent", 'q', no_argument }, + { "sort-by-file", 'F', no_argument }, + { "sort-output", 's', no_argument }, + { "species", CHAR_MAX + 1, required_argument }, + { "strict", CHAR_MAX + 10, no_argument }, + { 
"stringtable-input", CHAR_MAX + 5, no_argument }, + { "stringtable-output", CHAR_MAX + 11, no_argument }, + { "style", CHAR_MAX + 7, required_argument }, + { "url", CHAR_MAX + 2, required_argument }, + { "version", 'V', no_argument }, + { "width", 'w', required_argument }, + }; + END_ALLOW_OMITTING_FIELD_INITIALIZERS + /* The flag NON_OPTION_TERMINATES_OPTIONS causes option parsing to terminate + when the first non-option, i.e. the subprogram name, is encountered. */ + start_options (argc, argv, options, NON_OPTION_TERMINATES_OPTIONS, 0); + { + int opt; + while ((opt = get_next_option ()) != -1) + switch (opt) + { + case '\0': /* Long option with key == 0. */ + break; + + case 'i': + if (input_file != NULL) + { + error (EXIT_SUCCESS, 0, _("at most one input file allowed")); + usage (EXIT_FAILURE); + } + input_file = optarg; + break; + + case 'D': + dir_list_append (optarg); + break; + + case 'o': + output_file = optarg; + break; + + case CHAR_MAX + 1: /* --species */ + species = optarg; + break; + + case CHAR_MAX + 2: /* --url */ + url = optarg; + break; + + case 'm': /* --model */ + model = optarg; + break; + + case CHAR_MAX + 3: /* --prompt */ + prompt = optarg; + break; + + case CHAR_MAX + 4: /* --postprocess */ + postprocess = optarg; + break; + + case 'P': + input_syntax = &input_format_properties; + break; + + case CHAR_MAX + 5: /* --stringtable-input */ + input_syntax = &input_format_stringtable; + break; + + case CHAR_MAX + 6: /* --color */ + if (handle_color_option (optarg) || color_test_mode) + usage (EXIT_FAILURE); + break; + + case CHAR_MAX + 7: /* --style */ + handle_style_option (optarg); + break; + + case CHAR_MAX + 8: /* --indent */ + message_print_style_indent (); + break; + + case CHAR_MAX + 9: /* --no-location */ + message_print_style_filepos (filepos_comment_none); + break; + + case 'n': /* -n */ + case CHAR_MAX + 'n': /* --add-location[={full|yes|file|never|no}] */ + if (handle_filepos_comment_option (optarg)) + usage (EXIT_FAILURE); + break; + 
+ case CHAR_MAX + 10: /* --strict */ + message_print_style_uniforum (); + break; + + case 'p': + output_syntax = &output_format_properties; + break; + + case CHAR_MAX + 11: /* --stringtable-output */ + output_syntax = &output_format_stringtable; + break; + + case 'w': + { + char *endp; + int value = strtol (optarg, &endp, 10); + if (endp != optarg) + message_page_width_set (value); + } + break; + + case CHAR_MAX + 12: /* --no-wrap */ + message_page_width_ignore (); + break; + + case 's': + sort_by_msgid = true; + break; + + case 'F': + sort_by_filepos = true; + break; + + case 'h': + do_help = true; + break; + + case 'V': + do_version = true; + break; + + case 'q': /* --quiet, --silent */ + quiet = true; + break; + + default: + usage (EXIT_FAILURE); + break; + } + } + + /* Version information is requested. */ + if (do_version) + { + printf ("%s (GNU %s) %s\n", last_component (program_name), + PACKAGE, VERSION); + /* xgettext: no-wrap */ + printf (_("Copyright (C) %s Free Software Foundation, Inc.\n\ +License GPLv3+: GNU GPL version 3 or later <%s>\n\ +This is free software: you are free to change and redistribute it.\n\ +There is NO WARRANTY, to the extent permitted by law.\n\ +"), + "2001-2025", "https://gnu.org/licenses/gpl.html"); + printf (_("Written by %s.\n"), proper_name ("Bruno Haible")); + exit (EXIT_SUCCESS); + } + + /* Help is requested. */ + if (do_help) + usage (EXIT_SUCCESS); + + /* Test for extraneous arguments. */ + if (optind != argc) + error (EXIT_FAILURE, 0, _("too many arguments")); + + /* Check --species option. */ + if (strcmp (species, "ollama") != 0) + error (EXIT_FAILURE, 0, _("invalid value for %s option: %s"), + "--species", species); + + /* Check --model option. */ + if (model == NULL) + error (EXIT_FAILURE, 0, _("missing %s option"), + "--model"); + + /* Verify selected options. 
*/ + if (sort_by_msgid && sort_by_filepos) + error (EXIT_FAILURE, 0, _("%s and %s are mutually exclusive"), + "--sort-output", "--sort-by-file"); + + /* By default, input comes from standard input. */ + if (input_file == NULL) + input_file = "-"; + + /* Read input file. */ + msgdomain_list_ty *result = read_catalog_file (input_file, input_syntax); + + /* Convert the input to UTF-8 first. */ + result = iconv_msgdomain_list (result, po_charset_utf8, true, input_file, + textmode_xerror_handler); + + /* Warn if the current locale is not suitable for this PO file. */ + compare_po_locale_charsets (result); + + /* Extract the target language from the header entry. */ + if (prompt == NULL) + { + bool header_found = false; + for (size_t k = 0; k < result->nitems; k++) + { + message_list_ty *mlp = result->item[k]->messages; + message_ty *header = message_list_search (mlp, NULL, ""); + if (header != NULL && !header->obsolete) + { + header_found = true; + const char *nullentry = header->msgstr; + const char *language = c_strstr (nullentry, "Language: "); + if (language != NULL) + { + language += 10; + + size_t len = strcspn (language, " \t\n"); + if (len > 0) + { + char *memory = (char *) malloc (len + 1); + memcpy (memory, language, len); + memory[len] = '\0'; + + to_language = memory; + break; + } + } + } + + if (to_language != NULL) + break; + } + + if (!header_found) + error (EXIT_FAILURE, 0, _("The input does not have a header entry.")); + + if (to_language == NULL) + error (EXIT_FAILURE, 0, + _("The input's header entry does not contain the '%s' header field."), + "Language"); + } + + /* The name of the subprogram. */ + sub_name = "spit"; + + /* Attempt to locate the subprogram. + This is an optimization, to avoid that spawn/exec searches the PATH + on every call. */ + sub_path = find_in_path (sub_name); + + /* Build the argument list for the subprogram. 
*/ + sub_argv = (const char **) XNMALLOC (7, const char *); + { + sub_argv[0] = sub_path; + size_t i = 1; + + if (species != NULL) + sub_argv[i++] = xasprintf ("--species=%s", species); + + if (url != NULL) + sub_argv[i++] = xasprintf ("--url=%s", url); + + sub_argv[i++] = xasprintf ("--model=%s", model); + + if (prompt != NULL) + sub_argv[i++] = xasprintf ("--prompt=%s", prompt); + else + sub_argv[i++] = xasprintf ("--to=%s", to_language); + + if (postprocess != NULL) + sub_argv[i++] = xasprintf ("--postprocess=%s", postprocess); + + sub_argv[i] = NULL; + sub_argc = i; + } + + /* Apply the subprogram. */ + result = process_msgdomain_list (result); + + /* Sort the results. */ + if (sort_by_filepos) + msgdomain_list_sort_by_filepos (result); + else if (sort_by_msgid) + msgdomain_list_sort_by_msgid (result); + + /* Write the merged message list out. */ + msgdomain_list_print (result, output_file, output_syntax, + textmode_xerror_handler, force_po, false); + + exit (EXIT_SUCCESS); +} + + +/* Display usage information and exit. 
*/ +static void +usage (int status) +{ /* For a non-success STATUS, print only a short hint to stderr; otherwise print the complete help text to stdout. In both cases the function ends by calling exit (STATUS). */ + if (status != EXIT_SUCCESS) + fprintf (stderr, _("Try '%s --help' for more information.\n"), + program_name); + else + { + printf (_("\ +Usage: %s [OPTION...]\n\ +"), program_name); + printf ("\n"); + printf (_("\ +Pretranslates a translation catalog.\n\ +")); + printf ("\n"); + printf (_("\ +Warning: The pretranslations might not be what you expect.\n\ +They might be of the wrong form, be of poor quality, or reflect some biases.\n")); + printf ("\n"); + printf (_("\ +Mandatory arguments to long options are mandatory for short options too.\n")); + printf ("\n"); + printf (_("\ +Input file location:\n")); + printf (_("\ + -i, --input=INPUTFILE input PO file\n")); + printf (_("\ + -D, --directory=DIRECTORY add DIRECTORY to list for input files search\n")); + printf (_("\ +If no input file is given or if it is -, standard input is read.\n")); + printf ("\n"); + printf (_("\ +Output file location:\n")); + printf (_("\ + -o, --output-file=FILE write output to specified file\n")); + printf (_("\ +The results are written to standard output if no output file is specified\n\ +or if it is -.\n")); + printf ("\n"); + printf (_("\ +Message selection:\n")); + printf (_("\ + --keep-fuzzy Keep fuzzy messages unmodified.\n\ + Pretranslate only untranslated messages.\n")); + printf ("\n"); + printf (_("\ +Large Language Model (LLM) options:\n")); + printf (_("\ + --species=TYPE Specifies the type of LLM. 
The default and only\n\ + valid value is '%s'.\n"), + "ollama"); + printf (_("\ + --url=URL Specifies the URL of the server that runs the LLM.\n")); + printf (_("\ + -m, --model=MODEL Specifies the model to use.\n")); + printf (_("\ + --prompt=TEXT Specifies the prompt to use before standard input.\n")); + printf (_("\ + --postprocess=COMMAND Specifies a command to post-process the output.\n")); + printf ("\n"); + printf (_("\ +Input file syntax:\n")); + printf (_("\ + -P, --properties-input input file is in Java .properties syntax\n")); + printf (_("\ + --stringtable-input input file is in NeXTstep/GNUstep .strings syntax\n")); + printf ("\n"); + printf (_("\ +Output details:\n")); + printf (_("\ + --color use colors and other text attributes always\n\ + --color=WHEN use colors and other text attributes if WHEN.\n\ + WHEN may be 'always', 'never', 'auto', or 'html'.\n")); + printf (_("\ + --style=STYLEFILE specify CSS style rule file for --color\n")); + printf (_("\ + --force-po write PO file even if empty\n")); + printf (_("\ + --indent indented output style\n")); + printf (_("\ + --no-location suppress '#: filename:line' lines\n")); + printf (_("\ + -n, --add-location preserve '#: filename:line' lines (default)\n")); + printf (_("\ + --strict strict Uniforum output style\n")); + printf (_("\ + -p, --properties-output write out a Java .properties file\n")); + printf (_("\ + --stringtable-output write out a NeXTstep/GNUstep .strings file\n")); + printf (_("\ + -w, --width=NUMBER set output page width\n")); + printf (_("\ + --no-wrap do not break long message lines, longer than\n\ + the output page width, into several lines\n")); + printf (_("\ + -s, --sort-output generate sorted output\n")); + printf (_("\ + -F, --sort-by-file sort output by file location\n")); + printf ("\n"); + printf (_("\ +Informative output:\n")); + printf (_("\ + -h, --help display this help and exit\n")); + printf (_("\ + -V, --version output version information and exit\n")); + printf 
(_("\ + -q, --quiet, --silent suppress progress indicators\n")); + printf ("\n"); + /* TRANSLATORS: The first placeholder is the web address of the Savannah + project of this package. The second placeholder is the bug-reporting + email address for this package. Please add _another line_ saying + "Report translation bugs to <...>\n" with the address for translation + bugs (typically your translation team's web or email address). */ + printf (_("\ +Report bugs in the bug tracker at <%s>\n\ +or by email to <%s>.\n"), + "https://savannah.gnu.org/projects/gettext", + "bug-gettext@gnu.org"); + } + + exit (status); +} + + +/* Callbacks called from pipe_filter_ii_execute. */ + +struct locals +{ + /* String being written. */ + const char *str; + size_t len; + /* String being read and accumulated. */ + char *result; + size_t allocated; + size_t length; +}; + +/* Write callback: offers the entire not-yet-written remainder of the string (LEN bytes starting at STR), or returns NULL once nothing is left to write. */ static const void * +prepare_write (size_t *num_bytes_p, void *private_data) +{ + struct locals *l = (struct locals *) private_data; + + if (l->len > 0) + { + *num_bytes_p = l->len; + return l->str; + } + else + return NULL; +} + +/* Write-completion callback: skips past the NUM_BYTES_WRITTEN bytes that were consumed. DATA_WRITTEN is unused. */ static void +done_write (void *data_written, size_t num_bytes_written, void *private_data) +{ + struct locals *l = (struct locals *) private_data; + + l->str += num_bytes_written; + l->len -= num_bytes_written; +} + +/* Read callback: returns the free tail of the result buffer and stores its size in *NUM_BYTES_P. When the buffer is full it is first enlarged to 1.5 times its size plus one byte, so the returned space is never empty. */ static void * +prepare_read (size_t *num_bytes_p, void *private_data) +{ + struct locals *l = (struct locals *) private_data; + + if (l->length == l->allocated) + { + l->allocated = l->allocated + (l->allocated >> 1) + 1; + l->result = (char *) xrealloc (l->result, l->allocated); + } + *num_bytes_p = l->allocated - l->length; + return l->result + l->length; +} + +/* Read-completion callback: accounts for NUM_BYTES_READ new bytes in the result buffer. DATA_READ is unused. */ static void +done_read (void *data_read, size_t num_bytes_read, void *private_data) +{ + struct locals *l = (struct locals *) private_data; + + l->length += num_bytes_read; +} + + +/* Process a string STR of size LEN bytes through the subprogram. + Store the freshly allocated result at *RESULTP and its length at *LENGTHP. 
+ */ +static void +generic_filter (const char *str, size_t len, char **resultp, size_t *lengthp) +{ + struct locals l; + l.str = str; + l.len = len; + l.allocated = len + (len >> 2) + 1; + l.result = XNMALLOC (l.allocated, char); + l.length = 0; + + pipe_filter_ii_execute (sub_name, sub_path, sub_argv, false, true, + prepare_write, done_write, prepare_read, done_read, + &l); + + *resultp = l.result; + *lengthp = l.length; +} + + +/* Process a string STR of size LEN bytes, then remove NUL bytes. + Store the freshly allocated result at *RESULTP and its length at *LENGTHP. + */ +static void +process_string (const char *str, size_t len, char **resultp, size_t *lengthp) +{ + char *result; + size_t length; + generic_filter (str, len, &result, &length); + + /* Remove NUL bytes from result. */ + { + char *p = result; + char *pend = result + length; + + for (; p < pend; p++) + if (*p == '\0') + { + char *q = p; + for (; p < pend; p++) + if (*p != '\0') + *q++ = *p; + length = q - result; + break; + } + } + + *resultp = result; + *lengthp = length; +} + + +/* Number of messages processed so far. */ +static size_t messages_processed; + + +static void +process_message (message_ty *mp) +{ + /* Keep the header entry unmodified. */ + if (is_header (mp)) + return; + + /* Ignore obsolete messages. */ + if (mp->obsolete) + return; + + /* Translate only untranslated or, if --keep-fuzzy is not specified, fuzzy + messages. */ + if (!(mp->msgstr[0] == '\0' + || (mp->is_fuzzy && !keep_fuzzy))) + return; + + /* Because querying a Large Language Model can take a while + we print something to signal we are not dead. */ + if (!quiet) + { + fputc ('.', stderr); + messages_processed++; + } + + /* Take the msgid. + For a plural message, take the msgid_plural and repeat its translation + for each of the plural forms. Let the translator work out the plural + forms. */ + const char *msgid = (mp->msgid_plural != NULL ? 
mp->msgid_plural : mp->msgid); + + char *result; + size_t length; + process_string (msgid, strlen (msgid), &result, &length); + + /* Avoid an error later, during "msgfmt --check", due to a trailing newline. */ + if (strlen (msgid) > 0 && msgid[strlen (msgid) - 1] == '\n') + { + /* msgid ends in a newline. Ensure that the result ends in a newline + as well. */ + if (!(length > 0 && result[length - 1] == '\n')) + { + result = (char *) xrealloc (result, length + 1); + result[length] = '\n'; + length++; + } + } + else + { + /* msgid does not end in a newline. Ensure that the same holds for the + result. */ + while (length > 0 && result[length - 1] == '\n') + length--; + } + + /* Count the number of plural forms. */ + size_t nplurals; + { + const char *msgstr = mp->msgstr; + size_t msgstr_len = mp->msgstr_len; + nplurals = 0; + for (const char *p = msgstr; p < msgstr + msgstr_len; p += strlen (p) + 1) + nplurals++; + } + + /* Produce nplurals copies of the result, each with an added NUL. */ + size_t msgstr_len = nplurals * (length + 1); + char *msgstr = XNMALLOC (msgstr_len, char); + { + char *p; + size_t k; + for (p = msgstr, k = 0; k < nplurals; k++) + { + memcpy (p, result, length); + p += length; + *p++ = '\0'; + } + } + + mp->msgstr = msgstr; + mp->msgstr_len = msgstr_len; + + /* Mark the message as fuzzy, so that the translator can review it. */ + mp->is_fuzzy = (msgstr_len > 0); +} + + +static void +process_message_list (message_list_ty *mlp) +{ + for (size_t j = 0; j < mlp->nitems; j++) + process_message (mlp->item[j]); +} + + +static msgdomain_list_ty * +process_msgdomain_list (msgdomain_list_ty *mdlp) +{ + messages_processed = 0; + + for (size_t k = 0; k < mdlp->nitems; k++) + process_message_list (mdlp->item[k]->messages); + + if (messages_processed > 0) + fputc ('\n', stderr); + + return mdlp; +}