From: Jim Hague Date: Wed, 27 Dec 2017 14:55:02 +0000 (+0000) Subject: eit: allow scraper regexes to be marked as filters (#4818) X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=715a4a84439c7a71063053ff5ec120fc27bdc683;p=thirdparty%2Ftvheadend.git eit: allow scraper regexes to be marked as filters (#4818) Introduce an extended scraper regex syntax; as well as a string, a regex can be specified as a map. The map must have an entry "pattern" with the regex pattern. It may also have an entry "filter", with a numeric value. If this value is not 0, the regular expression is a filter. If a filter regular expression matches the input text, the result of the match replaces the input text, and processing continues from the next regular expression with that new input text. Issue: 4818 --- diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README index 41ad82c3d..8c57e63c1 100644 --- a/data/conf/epggrab/eit/scrape/README +++ b/data/conf/epggrab/eit/scrape/README @@ -24,7 +24,7 @@ of the match is the contents of all the sub-patterns in the regular expression concatenated together. For each EPG episode, the title, description and summary are matched -in turn against the season_num, episode_num, airdate and is_new regexes. +in turn against the season_num, episode_num, airdate and is_new regex lists. - season_num converts the contents of the match result to an integer, and if successful sets the EPG season number. @@ -37,14 +37,56 @@ in turn against the season_num, episode_num, airdate and is_new regexes. the match result is ignored. Next, a combined title/summary text is made by joining the title, a space, -and the summary. The combined text is matched against the scrape_title regex. -On a match, the EPG title is set to the match result. +and the summary. The combined text is matched against the scrape_title regex +list. On a match, the EPG title is set to the match result. 
-Then the summary is matched against the scrape_subtitle regex. On a match, +Then the summary is matched against the scrape_subtitle regex list. On a match, the EPG subtitle is set to the match result. -Finally, the summary is matched against the scrape_summary regex. On a match, -the EPG summary is set to the match result. +Finally, the summary is matched against the scrape_summary regex list. On a +match, the EPG summary is set to the match result. + +Filtering regular expressions +----------------------------- + +Any regular expression in a list can be marked as a filtering regular +expression. If the regular expression is marked as a filter, and it matches +the input text, then the match result is not returned as a result, but +instead replaces the original text to match, and matching continues with the +next regular expression in the list. If a filter regular expression does +not match, matching moves to the next regular expression in the list as +usual. + +To mark a regular expression as a filter, it must be specified with an +expanded definition with a "pattern" component. This is the regular expression +pattern. It may also have an optional numeric "filter" component. If present, +and not 0, the regular expression is a filter. + +For example, in the following list, the first regex is a filter that +removes any first sentence starting "...". The following regexes see +only the text following that sentence. + +{ + "scrape_subtitle": [ + { + "pattern": "^[.][.][.][^:.?!]*[.:?!] +(.*)", + "filter": 1 + }, + { + "pattern": "^[0-9]+/[0-9]+[.] +(.*)", + "filter": 1 + }, + "^([^:]+): " + ] +} + +Given any of the following input texts, the above regex list matches +'Subtitle here': + +...Continued title. 1/6. Subtitle here: rest of summary +...Continued title. Subtitle here: rest of summary +1/6. 
Subtitle here: rest of summary +Subtitle here: rest of summary Regular expression engine ------------------------- diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c index 15c1aeb42..951141303 100644 --- a/src/epggrab/module/eitpatternlist.c +++ b/src/epggrab/module/eitpatternlist.c @@ -22,25 +22,36 @@ #include "eitpatternlist.h" #include "htsmsg.h" +#define MAX_TEXT_LEN 2048 + void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags ) { eit_pattern_t *pattern; htsmsg_field_t *f; const char *s; + int filter; TAILQ_INIT(list); if (!l) return; HTSMSG_FOREACH(f, l) { s = htsmsg_field_get_str(f); - if (s == NULL) continue; + filter = 0; + if (s == NULL) { + htsmsg_t *m = htsmsg_field_get_map(f); + if (m == NULL) continue; + s = htsmsg_get_str(m, "pattern"); + if (s == NULL) continue; + filter = htsmsg_get_bool_or_default(m, "filter", 0); + } pattern = calloc(1, sizeof(eit_pattern_t)); pattern->text = strdup(s); + pattern->filter = filter; if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) { tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text); free(pattern->text); free(pattern); } else { - tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\"", pattern->text); + tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\", filter %d", pattern->text, pattern->filter); TAILQ_INSERT_TAIL(list, pattern, p_links); } } @@ -74,7 +85,8 @@ static void rtrim(char *buf) void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l) { eit_pattern_t *p; - char matchbuf[2048]; + char textbuf[MAX_TEXT_LEN]; + char matchbuf[MAX_TEXT_LEN]; int matchno; assert(buf); @@ -93,7 +105,13 @@ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_p strncat(buf, matchbuf, size_buf - len - 1); } rtrim(buf); - tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf); + tvhtrace(LS_EPGGRAB," pattern \"%s\" matches '%s' from '%s'", p->text, buf, text); 
+ if (p->filter) { + strncpy(textbuf, buf, MAX_TEXT_LEN - 1); + textbuf[MAX_TEXT_LEN - 1] = '\0'; + text = textbuf; + continue; + } return buf; } return NULL; diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h index dc6a02416..c7c24519b 100644 --- a/src/epggrab/module/eitpatternlist.h +++ b/src/epggrab/module/eitpatternlist.h @@ -26,6 +26,7 @@ typedef struct eit_pattern { char *text; tvh_regex_t compiled; + int filter; TAILQ_ENTRY(eit_pattern) p_links; } eit_pattern_t; diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py index af886b901..19233ec52 100755 --- a/support/eitscrape_test.py +++ b/support/eitscrape_test.py @@ -64,11 +64,16 @@ except ImportError: class Regex(object): def __init__(self, engine, regex): self.engine = engine - self.regex = regex + if isinstance(regex, dict): + self.regex = regex["pattern"] + self.re_is_filter = (regex["filter"] != 0) + else: + self.regex = regex + self.re_is_filter = False flags = re_base_flag if not engine: flags |= re_posix_flag - self.regcomp = re.compile(regex, flags) + self.regcomp = re.compile(self.regex, flags) def search(self, text): match = self.regcomp.search(text) @@ -101,6 +106,9 @@ class EITScrapeTest(object): for regex in regexes: result = regex.search(text) if result is not None: + if regex.re_is_filter: + text = result + continue if result == expect: print 'OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text()) self.num_ok = self.num_ok + 1