From: Jim Hague Date: Thu, 28 Dec 2017 20:45:33 +0000 (+0000) Subject: eit: add language identifier for scraper regexes (#4820) X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=dddcf7c88d730a1c2e9981d8f74cad51fb1325fd;p=thirdparty%2Ftvheadend.git eit: add language identifier for scraper regexes (#4820) Allow scraper regexes to be tagged with a single language identifier or a list of language identifiers. Tagged regexes will only be used against input text in languages matching the tag. Otherwise the regex is skipped. Issue: #4820 --- diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README index 8c57e63c1..37061baae 100644 --- a/data/conf/epggrab/eit/scrape/README +++ b/data/conf/epggrab/eit/scrape/README @@ -122,6 +122,38 @@ A useful reference on the differences between POSIX, PCRE and PCRE2 regular expressions is at http://www.regular-expressions.info/refbasic.html. +Languages +--------- + +By default, regular expressions are applied to input text in any language. +You can specify that a regular expression is appropriate only for a +particular language or group of languages by using an expanded definition +with a "lang" component. This component may be either a string with a single +language identifier, or a list of language identifier strings. For +example: + +{ + "scrape_subtitle": [ + { + "pattern": "^[.][.][.][^:.?!]*[.:?!] +(.*)", + "lang": "eng" + }, + { + "pattern": "^[0-9]+/[0-9]+[.] +(.*)", + "lang": ["eng", "fre"] + }, + "^([^:]+): " + ] +} + +If the regular expression is marked with a language or group of languages, +and the input text language does not match one of those specified for +the regular expression, the regular expression is ignored and processing +continues with the next regular expression in the list. Input text that carries no language identifier is still tried against every regular expression, tagged or not. + +Language codes must be 3-character ISO 639-2/B codes as listed in +src/lang_codes.c. 
+ Testing ------- diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c index 17b4f4095..d6ced7625 100644 --- a/src/epggrab/module/eit.c +++ b/src/epggrab/module/eit.c @@ -480,7 +480,7 @@ _eit_scrape_episode(lang_str_t *str, /* search for season number */ RB_FOREACH(se, str, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_snum)) + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_snum)) if ((ev->en.s_num = positive_atoi(buffer))) { tvhtrace(LS_TBL_EIT," extract season number %d using %s", ev->en.s_num, eit_mod->id); break; @@ -489,7 +489,7 @@ _eit_scrape_episode(lang_str_t *str, /* ...for episode number */ RB_FOREACH(se, str, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_enum)) + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_enum)) if ((ev->en.e_num = positive_atoi(buffer))) { tvhtrace(LS_TBL_EIT," extract episode number %d using %s", ev->en.e_num, eit_mod->id); break; @@ -498,7 +498,7 @@ _eit_scrape_episode(lang_str_t *str, /* Extract original air date year */ RB_FOREACH(se, str, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_airdate)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_airdate)) { if (strlen(buffer) == 4) { /* Year component only, so assume it is the copyright year. */ ev->copyright_year = positive_atoi(buffer); @@ -509,7 +509,7 @@ _eit_scrape_episode(lang_str_t *str, /* Extract is_new flag. 
Any match is assumed to mean "new" */ RB_FOREACH(se, str, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_is_new)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_is_new)) { ev->is_new = 1; break; } @@ -540,7 +540,7 @@ _eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev) RB_FOREACH(se, ev->title, link) { snprintf(title_summary, sizeof(title_summary), "%s %s", se->str, lang_str_get(ev->summary, se->lang)); - if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, &eit_mod->p_scrape_title)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, se->lang, &eit_mod->p_scrape_title)) { tvhtrace(LS_TBL_EIT, " scrape title '%s' from '%s' using %s", buffer, title_summary, eit_mod->id); lang_str_set(&ls, buffer, se->lang); @@ -552,7 +552,7 @@ _eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev) if (eit_mod->scrape_subtitle) { RB_FOREACH(se, ev->summary, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_subtitle)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_scrape_subtitle)) { tvhtrace(LS_TBL_EIT, " scrape subtitle '%s' from '%s' using %s", buffer, se->str, eit_mod->id); lang_str_set(&ev->subtitle, buffer, se->lang); @@ -563,7 +563,7 @@ _eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev) if (eit_mod->scrape_summary) { lang_str_t *ls = lang_str_create(); RB_FOREACH(se, ev->summary, link) { - if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_summary)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_scrape_summary)) { tvhtrace(LS_TBL_EIT, " scrape summary '%s' from '%s' using %s", buffer, se->str, eit_mod->id); lang_str_set(&ls, buffer, se->lang); diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c index 951141303..442561594 100644 --- a/src/epggrab/module/eitpatternlist.c +++ 
b/src/epggrab/module/eitpatternlist.c @@ -24,30 +24,66 @@ #define MAX_TEXT_LEN 2048 +static char *get_languages_string(htsmsg_field_t *field) +{ + const char *s; + htsmsg_t *langlist; + + if (field == NULL) + return NULL; + + s = htsmsg_field_get_str(field); + if (s) { + return strdup(s); + } else { + langlist = htsmsg_field_get_list(field); + if (langlist) { + htsmsg_field_t *item; + char langbuf[MAX_TEXT_LEN]; + langbuf[0] = '\0'; + HTSMSG_FOREACH(item, langlist) { + s = htsmsg_field_get_str(item); + if (s) { + strncat(langbuf, s, sizeof(langbuf) - strlen(langbuf) - 1); + strncat(langbuf, "|", sizeof(langbuf) - strlen(langbuf) - 1); + } + } + if (strlen(langbuf) > 0) + return strdup(langbuf); + } + } + return NULL; +} + void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags ) { eit_pattern_t *pattern; htsmsg_field_t *f; - const char *s; + const char *text; int filter; + char *langs; TAILQ_INIT(list); if (!l) return; HTSMSG_FOREACH(f, l) { - s = htsmsg_field_get_str(f); + text = htsmsg_field_get_str(f); filter = 0; - if (s == NULL) { + langs = NULL; + if (text == NULL) { htsmsg_t *m = htsmsg_field_get_map(f); if (m == NULL) continue; - s = htsmsg_get_str(m, "pattern"); - if (s == NULL) continue; + text = htsmsg_get_str(m, "pattern"); + if (text == NULL) continue; filter = htsmsg_get_bool_or_default(m, "filter", 0); + langs = get_languages_string(htsmsg_field_find(m, "lang")); } pattern = calloc(1, sizeof(eit_pattern_t)); - pattern->text = strdup(s); + pattern->text = strdup(text); pattern->filter = filter; + pattern->langs = langs; if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) { tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text); + free(pattern->langs); free(pattern->text); free(pattern); } else { @@ -82,7 +118,7 @@ static void rtrim(char *buf) buf[len] = '\0'; } -void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l) +void *eit_pattern_apply_list(char 
*buf, size_t size_buf, const char *text, const char *lang, eit_pattern_list_t *l) { eit_pattern_t *p; char textbuf[MAX_TEXT_LEN]; @@ -95,7 +131,12 @@ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_p if (!l) return NULL; /* search and concatenate all subgroup matches - there must be at least one */ - TAILQ_FOREACH(p, l, p_links) + TAILQ_FOREACH(p, l, p_links) { + if (p->langs && lang) { + if (strstr(p->langs, lang) == NULL) { + continue; + } + } if (!regex_match(&p->compiled, text) && !regex_match_substring(&p->compiled, 1, buf, size_buf)) { for (matchno = 2; ; ++matchno) { @@ -114,6 +155,7 @@ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_p } return buf; } + } return NULL; } @@ -124,6 +166,7 @@ void eit_pattern_free_list ( eit_pattern_list_t *l ) if (!l) return; while ((p = TAILQ_FIRST(l)) != NULL) { TAILQ_REMOVE(l, p, p_links); + free(p->langs); free(p->text); regex_free(&p->compiled); free(p); diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h index c7c24519b..369206e95 100644 --- a/src/epggrab/module/eitpatternlist.h +++ b/src/epggrab/module/eitpatternlist.h @@ -27,6 +27,7 @@ typedef struct eit_pattern char *text; tvh_regex_t compiled; int filter; + char *langs; TAILQ_ENTRY(eit_pattern) p_links; } eit_pattern_t; @@ -45,6 +46,6 @@ void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, con * match in buf which is of size size_buf. * Return the buf or NULL if no match. 
*/ -void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l); +void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, const char *lang, eit_pattern_list_t *l); void eit_pattern_free_list ( eit_pattern_list_t *l ); #endif diff --git a/src/epggrab/module/opentv.c b/src/epggrab/module/opentv.c index 82b54d32d..09311cdfd 100644 --- a/src/epggrab/module/opentv.c +++ b/src/epggrab/module/opentv.c @@ -371,7 +371,7 @@ opentv_parse_event_section_one tvhdebug(LS_OPENTV, " title '%s'", ev.title); /* try to cleanup the title */ - if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.title, &mod->p_cleanup_title)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.title, lang, &mod->p_cleanup_title)) { tvhtrace(LS_OPENTV, " clean title '%s'", buffer); s = buffer; } else { @@ -392,15 +392,15 @@ opentv_parse_event_section_one memset(&en, 0, sizeof(en)); /* search for season number */ - if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_snum)) + if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_snum)) if ((en.s_num = atoi(buffer))) tvhtrace(LS_OPENTV," extract season number %d", en.s_num); /* ...for episode number */ - if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_enum)) + if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_enum)) if ((en.e_num = atoi(buffer))) tvhtrace(LS_OPENTV," extract episode number %d", en.e_num); /* ...for part number */ - if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_pnum)) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_pnum)) { if (buffer[0] >= 'a' && buffer[0] <= 'z') en.p_num = buffer[0] - 'a' + 1; else @@ -414,7 +414,7 @@ opentv_parse_event_section_one save |= epg_episode_set_epnum(ee, &en, &changes3); /* ...for subtitle */ - if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_subt)) { + if 
(eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_subt)) { tvhtrace(LS_OPENTV, " extract subtitle '%s'", buffer); ls = lang_str_create2(buffer, lang); save |= epg_episode_set_subtitle(ee, ls, &changes3); diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py index 19233ec52..1c95d6043 100755 --- a/support/eitscrape_test.py +++ b/support/eitscrape_test.py @@ -66,10 +66,21 @@ class Regex(object): self.engine = engine if isinstance(regex, dict): self.regex = regex["pattern"] - self.re_is_filter = (regex["filter"] != 0) + try: + self.re_is_filter = (regex["filter"] != 0) + except KeyError: + self.re_is_filter = False + try: + if isinstance(regex["lang"], str): + self.lang = [ regex["lang"] ] + else: + self.lang = regex["lang"] + except KeyError: + self.lang = None else: self.regex = regex self.re_is_filter = False + self.lang = None flags = re_base_flag if not engine: flags |= re_posix_flag @@ -99,11 +110,13 @@ class EITScrapeTest(object): self.num_failed = 0; self.num_ok = 0; - def run_test_case_i(self, text, regexes, expect, testing): + def run_test_case_i(self, text, lang, regexes, expect, testing): """Run a test case for text using the regular expression lists in reg, expecting the result of a match to be expect while running a test case for the string testing.""" for regex in regexes: + if lang and regex.lang and lang not in regex.lang: + continue result = regex.search(text) if result is not None: if regex.re_is_filter: @@ -138,7 +151,7 @@ class EITScrapeTest(object): canonical, _, for_engine = key.partition(':') if for_engine and for_engine != engine: continue - if canonical in ('comment', 'summary', 'title'): + if canonical in ('comment', 'summary', 'title', 'language'): continue if canonical in ('age', 'genre'): print 'Test case contains key "{key}" which is not currently tested for "{test}"'.format(key=key, test=test) @@ -158,8 +171,12 @@ class EITScrapeTest(object): text = test['summary'] if canonical == 'new_title': text = 
test['title'] + ' ' + text + if 'language' in test: + lang = test['language'] + else: + lang = None if regexes[canonical]: - self.run_test_case_i(text, regexes[canonical], test[key], key) + self.run_test_case_i(text, lang, regexes[canonical], test[key], key) else: print 'FAIL: no regex defined for key "{key}"'.format(key=canonical) self.num_failed = self.num_failed + 1 diff --git a/support/testdata/eitscrape/README b/support/testdata/eitscrape/README index bc3bd0aea..5d6226533 100644 --- a/support/testdata/eitscrape/README +++ b/support/testdata/eitscrape/README @@ -15,6 +15,8 @@ Input: - "summary" - summary field from EIT broadcast that will be scraped. +- "language" - optional 3-character ISO 639-2/B language code + string specifying the title and summary language. Expected Result: