From: Jim Hague Date: Thu, 14 Dec 2017 14:15:11 +0000 (+0000) Subject: eit: extend generic regex handling for subpatterns and use in scraper regex (#4795) X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=58611c8bc5881db6d07516f93b86209d11e6ee13;p=thirdparty%2Ftvheadend.git eit: extend generic regex handling for subpatterns and use in scraper regex (#4795) Currently scraper regex usage is hardwired to Posix. Using PCRE/PCRE2 if available would give more flexibility and potentially save repetition in patterns, e.g. (?:[.][.][.][:.]*[.:]|[0-9]+/[0-9]+[.])? ([^:]*): would require multiple Posix patterns, each duplicating the captured subpattern. So add regex_match_substring() and regex_match_substring_length() to the TVH regex interface. Also add a flags parameter to regex_compile(), so caseless matching can be optionally requested, rather than hardwired as at present (EIT scraper regex does not use caseless). One small change to EIT scraper processing. If the match does not fit into the buffer, it will be ignored, rather than (as at present) truncated. This is slightly simpler to implement with PCRE2. I am not convinced truncation is useful - or, for that matter, that trimming space from the right hand end of match in the EIT scraper is necessary or necessarily desirable, but I've left that in. 
Issue: #4795 --- diff --git a/src/dvr/dvr_autorec.c b/src/dvr/dvr_autorec.c index 82d85d524..4b4578c29 100644 --- a/src/dvr/dvr_autorec.c +++ b/src/dvr/dvr_autorec.c @@ -583,7 +583,7 @@ dvr_autorec_entry_class_title_set(void *o, const void *v) if (dae->dae_title) autorec_regfree(dae); dae->dae_error = 0; - if (!regex_compile(&dae->dae_title_regex, title, LS_DVR)) + if (!regex_compile(&dae->dae_title_regex, title, TVHREGEX_CASELESS, LS_DVR)) dae->dae_title = strdup(title); else dae->dae_error = 1; diff --git a/src/epg.c b/src/epg.c index e4ce0d3c2..9262a390f 100644 --- a/src/epg.c +++ b/src/epg.c @@ -3016,7 +3016,7 @@ static int _eq_init_str( epg_filter_str_t *f ) { if (f->comp != EC_RE) return 0; - return regex_compile(&f->re, f->str, LS_EPG); + return regex_compile(&f->re, f->str, TVHREGEX_CASELESS, LS_EPG); } static void @@ -3220,7 +3220,7 @@ epg_query ( epg_query_t *eq, access_t *perm ) if (_eq_init_str(&eq->channel_name)) goto fin; if (eq->stitle) - if (regex_compile(&eq->stitle_re, eq->stitle, LS_EPG)) + if (regex_compile(&eq->stitle_re, eq->stitle, TVHREGEX_CASELESS, LS_EPG)) goto fin; channel = channel_find_by_uuid(eq->channel) ?: diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c index 3b1538d81..b3998990d 100644 --- a/src/epggrab/module/eitpatternlist.c +++ b/src/epggrab/module/eitpatternlist.c @@ -35,7 +35,7 @@ void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l ) if (s == NULL) continue; pattern = calloc(1, sizeof(eit_pattern_t)); pattern->text = strdup(s); - if (regcomp(&pattern->compiled, pattern->text, REG_EXTENDED)) { + if (regex_compile(&pattern->compiled, pattern->text, 0, LS_EPGGRAB)) { tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text); free(pattern->text); free(pattern); @@ -53,11 +53,17 @@ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_p return eit_pattern_apply_list_2(b, s, text, l); } +static void rtrim(char *buf) +{ + size_t len = 
strlen(buf); + while (len > 0 && isspace(buf[len - 1])) + --len; + buf[len] = '\0'; +} + void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l) { - regmatch_t match[3]; eit_pattern_t *p; - ssize_t size; assert(buf[0]); assert(text); @@ -65,26 +71,15 @@ void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *tex if (!l) return NULL; /* search and report the first match */ TAILQ_FOREACH(p, l, p_links) - if (!regexec(&p->compiled, text, 3, match, 0) && match[1].rm_so != -1) { - size = MIN(match[1].rm_eo - match[1].rm_so, size_buf[0] - 1); - if (size > 0) { - while (isspace(text[match[1].rm_so + size - 1])) - size--; - memcpy(buf[0], text + match[1].rm_so, size); - } - buf[0][size] = '\0'; - if (match[2].rm_so != -1 && buf[1]) { - size = MIN(match[2].rm_eo - match[2].rm_so, size_buf[1] - 1); - if (size > 0) { - while (isspace(text[match[2].rm_so + size - 1])) - size--; - memcpy(buf[1], text + match[2].rm_so, size); - } - buf[1][size] = '\0'; - tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s' & '%s'", p->text, buf[0], buf[1]); + if (!regex_match(&p->compiled, text) && + !regex_match_substring(&p->compiled, 1, buf[0], size_buf[0])) { + rtrim(buf[0]); + if (buf[1] && !regex_match_substring(&p->compiled, 2, buf[1], size_buf[1])) { + rtrim(buf[1]); + tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s' & '%s'", p->text, buf[0], buf[1]); } else { - buf[1] = NULL; - tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf[0]); + buf[1] = NULL; + tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf[0]); } return buf[0]; } @@ -99,7 +94,7 @@ void eit_pattern_free_list ( eit_pattern_list_t *l ) while ((p = TAILQ_FIRST(l)) != NULL) { TAILQ_REMOVE(l, p, p_links); free(p->text); - regfree(&p->compiled); + regex_free(&p->compiled); free(p); } } diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h index b779f4125..c48912f27 100644 --- 
a/src/epggrab/module/eitpatternlist.h +++ b/src/epggrab/module/eitpatternlist.h @@ -19,8 +19,8 @@ #ifndef __EITPATTERN_LIST__ #define __EITPATTERN_LIST__ -#include #include "queue.h" +#include "tvhregex.h" typedef struct eit_pattern { diff --git a/src/tvhregex.h b/src/tvhregex.h index 72b941456..66cff4dc0 100644 --- a/src/tvhregex.h +++ b/src/tvhregex.h @@ -37,10 +37,18 @@ #endif +#define TVHREGEX_MAX_MATCHES 10 + +/* Compile flags */ +#define TVHREGEX_CASELESS 1 + typedef struct { #if ENABLE_PCRE pcre *re_code; pcre_extra *re_extra; + int re_match[TVHREGEX_MAX_MATCHES * 3]; + const char *re_text; + int re_matches; #if PCRE_STUDY_JIT_COMPILE pcre_jit_stack *re_jit_stack; #endif @@ -51,24 +59,15 @@ typedef struct { pcre2_jit_stack *re_jit_stack; #else regex_t re_code; + regmatch_t re_match[TVHREGEX_MAX_MATCHES]; + const char *re_text; #endif } tvh_regex_t; -static inline int regex_match(tvh_regex_t *regex, const char *str) -{ -#if ENABLE_PCRE - int vec[30]; - return pcre_exec(regex->re_code, regex->re_extra, - str, strlen(str), 0, 0, vec, ARRAY_SIZE(vec)) < 0; -#elif ENABLE_PCRE2 - return pcre2_match(regex->re_code, (PCRE2_SPTR8)str, -1, 0, 0, - regex->re_match, regex->re_mcontext) <= 0; -#else - return regexec(&regex->re_code, str, 0, NULL, 0); -#endif -} - void regex_free(tvh_regex_t *regex); -int regex_compile(tvh_regex_t *regex, const char *re_str, int subsys); +int regex_compile(tvh_regex_t *regex, const char *re_str, int flags, int subsys); +int regex_match(tvh_regex_t *regex, const char *str); +int regex_match_substring(tvh_regex_t *regex, unsigned number, char *buf, size_t size_buf); +int regex_match_substring_length(tvh_regex_t *regex, unsigned number); #endif /* __TVHREGEX_H__ */ diff --git a/src/wrappers.c b/src/wrappers.c index 36b434337..52159dc3f 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -1,5 +1,6 @@ #define __USE_GNU #include "tvheadend.h" +#include #include #include /* See NOTES */ #include @@ -447,6 +448,7 @@ void regex_free(tvh_regex_t 
*regex) pcre_free(regex->re_code); regex->re_extra = NULL; regex->re_code = NULL; + regex->re_text = NULL; #elif ENABLE_PCRE2 pcre2_jit_stack_free(regex->re_jit_stack); pcre2_match_data_free(regex->re_match); @@ -458,20 +460,23 @@ void regex_free(tvh_regex_t *regex) regex->re_jit_stack = NULL; #else regfree(&regex->re_code); + regex->re_text = NULL; #endif } -int regex_compile(tvh_regex_t *regex, const char *re_str, int subsys) +int regex_compile(tvh_regex_t *regex, const char *re_str, int flags, int subsys) { #if ENABLE_PCRE const char *estr; int eoff; + int options = PCRE_UTF8; + if (flags & TVHREGEX_CASELESS) + options |= PCRE_CASELESS; #if PCRE_STUDY_JIT_COMPILE regex->re_jit_stack = NULL; #endif regex->re_extra = NULL; - regex->re_code = pcre_compile(re_str, PCRE_CASELESS | PCRE_UTF8, - &estr, &eoff, NULL); + regex->re_code = pcre_compile(re_str, options, &estr, &eoff, NULL); if (regex->re_code == NULL) { tvherror(subsys, "Unable to compile PCRE '%s': %s", re_str, estr); } else { @@ -494,18 +499,21 @@ int regex_compile(tvh_regex_t *regex, const char *re_str, int subsys) int ecode; PCRE2_SIZE eoff; size_t jsz; + uint32_t options; assert(regex->re_jit_stack == NULL); regex->re_jit_stack = NULL; regex->re_match = NULL; regex->re_mcontext = pcre2_match_context_create(NULL); - regex->re_code = pcre2_compile((PCRE2_SPTR8)re_str, -1, - PCRE2_CASELESS | PCRE2_UTF, + options = PCRE2_UTF; + if (flags & TVHREGEX_CASELESS) + options |= PCRE2_CASELESS; + regex->re_code = pcre2_compile((PCRE2_SPTR8)re_str, -1, options, &ecode, &eoff, NULL); if (regex->re_code == NULL) { (void)pcre2_get_error_message(ecode, ebuf, 120); tvherror(subsys, "Unable to compile PCRE2 '%s': %s", re_str, ebuf); } else { - regex->re_match = pcre2_match_data_create(20, NULL); + regex->re_match = pcre2_match_data_create(TVHREGEX_MAX_MATCHES, NULL); if (re_str[0] && pcre2_jit_compile(regex->re_code, PCRE2_JIT_COMPLETE) >= 0) { jsz = 0; if (pcre2_pattern_info(regex->re_code, PCRE2_INFO_JITSIZE, &jsz) >= 0 && 
jsz > 0) { @@ -518,10 +526,76 @@ int regex_compile(tvh_regex_t *regex, const char *re_str, int subsys) } return -1; #else - if (!regcomp(&regex->re_code, re_str, - REG_ICASE | REG_EXTENDED | REG_NOSUB)) + int options = REG_EXTENDED; + if (flags & TVHREGEX_CASELESS) + options |= REG_ICASE; + if (!regcomp(&regex->re_code, re_str, options)) return 0; tvherror(subsys, "Unable to compile regex '%s'", re_str); return -1; #endif } + +int regex_match(tvh_regex_t *regex, const char *str) +{ +#if ENABLE_PCRE + regex->re_text = str; + regex->re_matches = + pcre_exec(regex->re_code, regex->re_extra, + str, strlen(str), 0, 0, regex->re_match, TVHREGEX_MAX_MATCHES * 3); + return regex->re_matches < 0; +#elif ENABLE_PCRE2 + return pcre2_match(regex->re_code, (PCRE2_SPTR8)str, -1, 0, 0, + regex->re_match, regex->re_mcontext) <= 0; +#else + regex->re_text = str; + return regexec(&regex->re_code, str, TVHREGEX_MAX_MATCHES, regex->re_match, 0); +#endif +} + +int regex_match_substring(tvh_regex_t *regex, unsigned number, char *buf, size_t size_buf) +{ + assert(buf); + if (number >= TVHREGEX_MAX_MATCHES) + return -2; +#if ENABLE_PCRE + return pcre_copy_substring(regex->re_text, regex->re_match, + (regex->re_matches == 0) + ? 
TVHREGEX_MAX_MATCHES + : regex->re_matches, + number, buf, size_buf) < 0; +#elif ENABLE_PCRE2 + PCRE2_SIZE psiz = size_buf; + return pcre2_substring_copy_bynumber(regex->re_match, number, (PCRE2_UCHAR8*)buf, &psiz); +#else + if (regex->re_match[number].rm_so == -1) + return -1; + ssize_t size = regex->re_match[number].rm_eo - regex->re_match[number].rm_so; + if (size < 0 || size > (size_buf - 1)) + return -1; + memcpy(buf, regex->re_text + regex->re_match[number].rm_so, size); + buf[size] = '\0'; + return 0; +#endif +} + +int regex_match_substring_length(tvh_regex_t *regex, unsigned number) +{ + if (number >= TVHREGEX_MAX_MATCHES) + return -2; +#if ENABLE_PCRE + if (number >= regex->re_matches) + return -1; + if (regex->re_match[number * 2] == -1) + return -1; + return regex->re_match[number * 2 + 1] - regex->re_match[number * 2]; +#elif ENABLE_PCRE2 + PCRE2_SIZE len; + int rc = pcre2_substring_length_bynumber(regex->re_match, number, &len); + return (!rc) ? len : -1; +#else + if (regex->re_match[number].rm_so == -1) + return -1; + return regex->re_match[number].rm_eo - regex->re_match[number].rm_so; +#endif +}