From 0a275d764a70fcb65f36dbfa80d0013a63e268f9 Mon Sep 17 00:00:00 2001 From: Jim Hague Date: Sat, 16 Dec 2017 20:59:16 +0000 Subject: [PATCH] eit: if PCRE/PCRE2 in use, regexes can be marked for Posix engine execution only (#4795) If fancier regex engines are available, we need to be able to mark regexes that should only ever be executed by the Posix engine, to make sure that they will always work as expected. If PCRE or PCRE2 is available, look for regexes specific to those. These have the same name, but are under a map named "pcre" or "pcre2". If they are not found, fall back to the top level Posix regexes, but make sure these are executed by the Posix engine. --- data/conf/epggrab/eit/scrape/README | 38 ++++- data/conf/epggrab/eit/scrape/uk | 12 +- src/epggrab/module/eit.c | 10 +- src/epggrab/module/eitpatternlist.c | 19 ++- src/epggrab/module/eitpatternlist.h | 6 +- src/epggrab/module/opentv.c | 10 +- src/tvhregex.h | 21 ++- src/wrappers.c | 248 ++++++++++++++++------------ support/eitscrape_test.py | 65 ++++---- 9 files changed, 254 insertions(+), 175 deletions(-) diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README index 015a4fe24..51972c4d6 100644 --- a/data/conf/epggrab/eit/scrape/README +++ b/data/conf/epggrab/eit/scrape/README @@ -41,14 +41,36 @@ EPG summary is not changed. Regular expression engine ------------------------- -If the PCRE or PCRE2 library is found during configuration, that library -is used for regular expression matching. Otherwise, the default C library -POSIX regular expression handling is used, and the regular expressions -treated as extended POSIX regular expressions. - -If a regular expression is intended for universal use, you need to be careful -to ensure that it works as expected on PCRE and POSIX engines. A useful -reference is at http://www.regular-expressions.info/refbasic.html. 
+The top level regular expressions are POSIX regular expressions, and are +executed using the C runtime library regular expression functions. The +expressions are treated as extended POSIX regular expressions. + +If either the PCRE or the PCRE2 library are found during configuration, +that library can be used for regular expression matching. If both are +found, PCRE2 is used. + +Because there are differences in regular expression facilities and +handling between POSIX, PCRE and PCRE2, you must specify an alternate +regular expression appropriate for the engine. Alternate regular +expressions are held as members of a top-level object "pcre" or "pcre2". +For example: + +{ + "scrape_subtitle": [ + "^[.][.][.][^:.]*[.:] ([^.0-9][^:]*): ", + "^[0-9]+/[0-9]+[.] +([^:]*): ", + "^([^.0-9][^:]+): " + ], + "pcre": { + "scrape_subtitle": [ + "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] +)?([^.0-9][^:]*): " + ] + } +} + +A useful reference on the differences between POSIX, PCRE and PCRE2 +regular expressions is at +http://www.regular-expressions.info/refbasic.html. Testing ------- diff --git a/data/conf/epggrab/eit/scrape/uk b/data/conf/epggrab/eit/scrape/uk index f7b383db0..b412318a6 100644 --- a/data/conf/epggrab/eit/scrape/uk +++ b/data/conf/epggrab/eit/scrape/uk @@ -24,5 +24,15 @@ ], "is_new" : [ "^(New: )" - ] + ], + "pcre": { + "scrape_subtitle": [ + "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] +)?([^.0-9][^:]*): " + ] + }, + "pcre2": { + "scrape_subtitle": [ + "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] 
+)?([^.0-9][^:]*): " + ] + } } diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c index eb967e019..d343c2f22 100644 --- a/src/epggrab/module/eit.c +++ b/src/epggrab/module/eit.c @@ -1193,14 +1193,14 @@ static void _eit_scrape_clear(eit_module_t *mod) static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) { if (mod->scrape_episode) { - eit_pattern_compile_list(&mod->p_snum, htsmsg_get_list(m, "season_num")); - eit_pattern_compile_list(&mod->p_enum, htsmsg_get_list(m, "episode_num")); - eit_pattern_compile_list(&mod->p_airdate, htsmsg_get_list(m, "airdate")); - eit_pattern_compile_list(&mod->p_is_new, htsmsg_get_list(m, "is_new")); + eit_pattern_compile_named_list(&mod->p_snum, m, "season_num"); + eit_pattern_compile_named_list(&mod->p_enum, m, "episode_num"); + eit_pattern_compile_named_list(&mod->p_airdate, m, "airdate"); + eit_pattern_compile_named_list(&mod->p_is_new, m, "is_new"); } if (mod->scrape_subtitle) { - eit_pattern_compile_list(&mod->p_scrape_subtitle, htsmsg_get_list(m, "scrape_subtitle")); + eit_pattern_compile_named_list(&mod->p_scrape_subtitle, m, "scrape_subtitle"); } return 1; diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c index b3998990d..787072a1b 100644 --- a/src/epggrab/module/eitpatternlist.c +++ b/src/epggrab/module/eitpatternlist.c @@ -22,7 +22,7 @@ #include "eitpatternlist.h" #include "htsmsg.h" -void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l ) +void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags ) { eit_pattern_t *pattern; htsmsg_field_t *f; @@ -35,7 +35,7 @@ void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l ) if (s == NULL) continue; pattern = calloc(1, sizeof(eit_pattern_t)); pattern->text = strdup(s); - if (regex_compile(&pattern->compiled, pattern->text, 0, LS_EPGGRAB)) { + if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) { tvhwarn(LS_EPGGRAB, "error compiling pattern 
\"%s\"", pattern->text); free(pattern->text); free(pattern); @@ -46,6 +46,21 @@ void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l ) } } +void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, const char *key) +{ +#if defined(TVHREGEX_TYPE) + htsmsg_t *m_alt = htsmsg_get_map(m, TVHREGEX_TYPE); + if (m_alt) { + htsmsg_t *res = htsmsg_get_list(m_alt, key); + if (res) { + eit_pattern_compile_list(list, res, 0); + return; + } + } +#endif + eit_pattern_compile_list(list, htsmsg_get_list(m, key), TVHREGEX_POSIX); +} + void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l) { char *b[2] = { buf, NULL }; diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h index c48912f27..a32d7b59a 100644 --- a/src/epggrab/module/eitpatternlist.h +++ b/src/epggrab/module/eitpatternlist.h @@ -25,7 +25,7 @@ typedef struct eit_pattern { char *text; - regex_t compiled; + tvh_regex_t compiled; TAILQ_ENTRY(eit_pattern) p_links; } eit_pattern_t; @@ -37,7 +37,9 @@ static inline int eit_pattern_list_empty ( eit_pattern_list_t *list ) { return TAILQ_EMPTY(list); } /* Compile a regular expression pattern from a message */ -void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l ); +void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags ); +/* Compile a regular expression pattern from a named message, applying message location conventions */ +void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, const char *key); /* Apply the compiled pattern to text. If it matches then return the * match in buf which is of size size_buf. * Return the buf or NULL if no match. 
diff --git a/src/epggrab/module/opentv.c b/src/epggrab/module/opentv.c index 8609f2752..82b54d32d 100644 --- a/src/epggrab/module/opentv.c +++ b/src/epggrab/module/opentv.c @@ -956,11 +956,11 @@ static int _opentv_prov_load_one ( const char *id, htsmsg_t *m ) mod->channel = _pid_list_to_array(cl); mod->title = _pid_list_to_array(tl); mod->summary = _pid_list_to_array(sl); - eit_pattern_compile_list(&mod->p_snum, htsmsg_get_list(m, "season_num")); - eit_pattern_compile_list(&mod->p_enum, htsmsg_get_list(m, "episode_num")); - eit_pattern_compile_list(&mod->p_pnum, htsmsg_get_list(m, "part_num")); - eit_pattern_compile_list(&mod->p_subt, htsmsg_get_list(m, "subtitle")); - eit_pattern_compile_list(&mod->p_cleanup_title, htsmsg_get_list(m, "cleanup_title")); + eit_pattern_compile_named_list(&mod->p_snum, m, "season_num"); + eit_pattern_compile_named_list(&mod->p_enum, m, "episode_num"); + eit_pattern_compile_named_list(&mod->p_pnum, m, "part_num"); + eit_pattern_compile_named_list(&mod->p_subt, m, "subtitle"); + eit_pattern_compile_named_list(&mod->p_cleanup_title, m, "cleanup_title"); return 1; } diff --git a/src/tvhregex.h b/src/tvhregex.h index 66cff4dc0..5c698eac3 100644 --- a/src/tvhregex.h +++ b/src/tvhregex.h @@ -26,21 +26,24 @@ # define PCRE_STUDY_JIT_COMPILE 0 # endif +#define TVHREGEX_TYPE "pcre" + #elif ENABLE_PCRE2 # define PCRE2_CODE_UNIT_WIDTH 8 # include <pcre2.h> -#else - -# include <regex.h> +#define TVHREGEX_TYPE "pcre2" #endif +# include <regex.h> + #define TVHREGEX_MAX_MATCHES 10 /* Compile flags */ -#define TVHREGEX_CASELESS 1 +#define TVHREGEX_POSIX 1 /* Use POSIX regex engine */ +#define TVHREGEX_CASELESS 2 /* Use case-insensitive matching */ typedef struct { #if ENABLE_PCRE @@ -57,10 +60,12 @@ typedef struct { pcre2_match_data *re_match; pcre2_match_context *re_mcontext; pcre2_jit_stack *re_jit_stack; -#else - regex_t re_code; - regmatch_t re_match[TVHREGEX_MAX_MATCHES]; - const char *re_text; +#endif + regex_t re_posix_code; + regmatch_t re_posix_match[TVHREGEX_MAX_MATCHES]; 
+ const char *re_posix_text; +#if ENABLE_PCRE || ENABLE_PCRE2 + int is_posix; #endif } tvh_regex_t; diff --git a/src/wrappers.c b/src/wrappers.c index 52159dc3f..b20bac523 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -433,123 +433,143 @@ tvh_qsort_r(void *base, size_t nmemb, size_t size, int (*compar)(const void *, c */ void regex_free(tvh_regex_t *regex) { +#if ENABLE_PCRE || ENABLE_PCRE2 + if (regex->is_posix) { +#endif + regfree(&regex->re_posix_code); + regex->re_posix_text = NULL; +#if ENABLE_PCRE || ENABLE_PCRE2 + } else { #if ENABLE_PCRE #ifdef PCRE_CONFIG_JIT #if PCRE_STUDY_JIT_COMPILE - if (regex->re_jit_stack) { - pcre_jit_stack_free(regex->re_jit_stack); - regex->re_jit_stack = NULL; - } + if (regex->re_jit_stack) { + pcre_jit_stack_free(regex->re_jit_stack); + regex->re_jit_stack = NULL; + } #endif - pcre_free_study(regex->re_extra); + pcre_free_study(regex->re_extra); #else - pcre_free(regex->re_extra); + pcre_free(regex->re_extra); #endif - pcre_free(regex->re_code); - regex->re_extra = NULL; - regex->re_code = NULL; - regex->re_text = NULL; + pcre_free(regex->re_code); + regex->re_extra = NULL; + regex->re_code = NULL; + regex->re_text = NULL; #elif ENABLE_PCRE2 - pcre2_jit_stack_free(regex->re_jit_stack); - pcre2_match_data_free(regex->re_match); - pcre2_code_free(regex->re_code); - pcre2_match_context_free(regex->re_mcontext); - regex->re_match = NULL; - regex->re_code = NULL; - regex->re_mcontext = NULL; - regex->re_jit_stack = NULL; -#else - regfree(&regex->re_code); - regex->re_text = NULL; + pcre2_jit_stack_free(regex->re_jit_stack); + pcre2_match_data_free(regex->re_match); + pcre2_code_free(regex->re_code); + pcre2_match_context_free(regex->re_mcontext); + regex->re_match = NULL; + regex->re_code = NULL; + regex->re_mcontext = NULL; + regex->re_jit_stack = NULL; +#endif + } #endif } int regex_compile(tvh_regex_t *regex, const char *re_str, int flags, int subsys) { +#if ENABLE_PCRE || ENABLE_PCRE2 + regex->is_posix = 0; + if (flags & 
TVHREGEX_POSIX) { + regex->is_posix = 1; +#endif + int options = REG_EXTENDED; + if (flags & TVHREGEX_CASELESS) + options |= REG_ICASE; + if (!regcomp(&regex->re_posix_code, re_str, options)) + return 0; + tvherror(subsys, "Unable to compile regex '%s'", re_str); + return -1; +#if ENABLE_PCRE || ENABLE_PCRE2 + } else { #if ENABLE_PCRE - const char *estr; - int eoff; - int options = PCRE_UTF8; - if (flags & TVHREGEX_CASELESS) - options |= PCRE_CASELESS; + const char *estr; + int eoff; + int options = PCRE_UTF8; + if (flags & TVHREGEX_CASELESS) + options |= PCRE_CASELESS; #if PCRE_STUDY_JIT_COMPILE - regex->re_jit_stack = NULL; + regex->re_jit_stack = NULL; #endif - regex->re_extra = NULL; - regex->re_code = pcre_compile(re_str, options, &estr, &eoff, NULL); - if (regex->re_code == NULL) { - tvherror(subsys, "Unable to compile PCRE '%s': %s", re_str, estr); - } else { - regex->re_extra = pcre_study(regex->re_code, - PCRE_STUDY_JIT_COMPILE, &estr); - if (regex->re_extra == NULL && estr) - tvherror(subsys, "Unable to study PCRE '%s': %s", re_str, estr); - else { + regex->re_extra = NULL; + regex->re_code = pcre_compile(re_str, options, &estr, &eoff, NULL); + if (regex->re_code == NULL) { + tvherror(subsys, "Unable to compile PCRE '%s': %s", re_str, estr); + } else { + regex->re_extra = pcre_study(regex->re_code, + PCRE_STUDY_JIT_COMPILE, &estr); + if (regex->re_extra == NULL && estr) + tvherror(subsys, "Unable to study PCRE '%s': %s", re_str, estr); + else { #if PCRE_STUDY_JIT_COMPILE - regex->re_jit_stack = pcre_jit_stack_alloc(32*1024, 512*1024); - if (regex->re_jit_stack) - pcre_assign_jit_stack(regex->re_extra, NULL, regex->re_jit_stack); + regex->re_jit_stack = pcre_jit_stack_alloc(32*1024, 512*1024); + if (regex->re_jit_stack) + pcre_assign_jit_stack(regex->re_extra, NULL, regex->re_jit_stack); #endif - return 0; + return 0; + } } - } - return -1; + return -1; #elif ENABLE_PCRE2 - PCRE2_UCHAR8 ebuf[128]; - int ecode; - PCRE2_SIZE eoff; - size_t jsz; - uint32_t 
options; - assert(regex->re_jit_stack == NULL); - regex->re_jit_stack = NULL; - regex->re_match = NULL; - regex->re_mcontext = pcre2_match_context_create(NULL); - options = PCRE2_UTF; - if (flags & TVHREGEX_CASELESS) - options |= PCRE2_CASELESS; - regex->re_code = pcre2_compile((PCRE2_SPTR8)re_str, -1, options, - &ecode, &eoff, NULL); - if (regex->re_code == NULL) { - (void)pcre2_get_error_message(ecode, ebuf, 120); - tvherror(subsys, "Unable to compile PCRE2 '%s': %s", re_str, ebuf); - } else { - regex->re_match = pcre2_match_data_create(TVHREGEX_MAX_MATCHES, NULL); - if (re_str[0] && pcre2_jit_compile(regex->re_code, PCRE2_JIT_COMPLETE) >= 0) { - jsz = 0; - if (pcre2_pattern_info(regex->re_code, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0) { - regex->re_jit_stack = pcre2_jit_stack_create(32 * 1024, 512 * 1024, NULL); - if (regex->re_jit_stack) - pcre2_jit_stack_assign(regex->re_mcontext, NULL, regex->re_jit_stack); + PCRE2_UCHAR8 ebuf[128]; + int ecode; + PCRE2_SIZE eoff; + size_t jsz; + uint32_t options; + assert(regex->re_jit_stack == NULL); + regex->re_jit_stack = NULL; + regex->re_match = NULL; + regex->re_mcontext = pcre2_match_context_create(NULL); + options = PCRE2_UTF; + if (flags & TVHREGEX_CASELESS) + options |= PCRE2_CASELESS; + regex->re_code = pcre2_compile((PCRE2_SPTR8)re_str, -1, options, + &ecode, &eoff, NULL); + if (regex->re_code == NULL) { + (void)pcre2_get_error_message(ecode, ebuf, 120); + tvherror(subsys, "Unable to compile PCRE2 '%s': %s", re_str, ebuf); + } else { + regex->re_match = pcre2_match_data_create(TVHREGEX_MAX_MATCHES, NULL); + if (re_str[0] && pcre2_jit_compile(regex->re_code, PCRE2_JIT_COMPLETE) >= 0) { + jsz = 0; + if (pcre2_pattern_info(regex->re_code, PCRE2_INFO_JITSIZE, &jsz) >= 0 && jsz > 0) { + regex->re_jit_stack = pcre2_jit_stack_create(32 * 1024, 512 * 1024, NULL); + if (regex->re_jit_stack) + pcre2_jit_stack_assign(regex->re_mcontext, NULL, regex->re_jit_stack); + } } + return 0; } - return 0; + return -1; +#endif } - 
return -1; -#else - int options = REG_EXTENDED; - if (flags & TVHREGEX_CASELESS) - options |= REG_ICASE; - if (!regcomp(&regex->re_code, re_str, options)) - return 0; - tvherror(subsys, "Unable to compile regex '%s'", re_str); - return -1; #endif } int regex_match(tvh_regex_t *regex, const char *str) { +#if ENABLE_PCRE || ENABLE_PCRE2 + if (regex->is_posix) { +#endif + regex->re_posix_text = str; + return regexec(&regex->re_posix_code, str, TVHREGEX_MAX_MATCHES, regex->re_posix_match, 0); +#if ENABLE_PCRE || ENABLE_PCRE2 + } else { #if ENABLE_PCRE - regex->re_text = str; - regex->re_matches = - pcre_exec(regex->re_code, regex->re_extra, - str, strlen(str), 0, 0, regex->re_match, TVHREGEX_MAX_MATCHES * 3); - return regex->re_matches < 0; + regex->re_text = str; + regex->re_matches = + pcre_exec(regex->re_code, regex->re_extra, + str, strlen(str), 0, 0, regex->re_match, TVHREGEX_MAX_MATCHES * 3); + return regex->re_matches < 0; #elif ENABLE_PCRE2 - return pcre2_match(regex->re_code, (PCRE2_SPTR8)str, -1, 0, 0, - regex->re_match, regex->re_mcontext) <= 0; -#else - regex->re_text = str; - return regexec(&regex->re_code, str, TVHREGEX_MAX_MATCHES, regex->re_match, 0); + return pcre2_match(regex->re_code, (PCRE2_SPTR8)str, -1, 0, 0, + regex->re_match, regex->re_mcontext) <= 0; +#endif + } #endif } @@ -558,24 +578,30 @@ int regex_match_substring(tvh_regex_t *regex, unsigned number, char *buf, size_t assert(buf); if (number >= TVHREGEX_MAX_MATCHES) return -2; +#if ENABLE_PCRE || ENABLE_PCRE2 + if (regex->is_posix) { +#endif + if (regex->re_posix_match[number].rm_so == -1) + return -1; + ssize_t size = regex->re_posix_match[number].rm_eo - regex->re_posix_match[number].rm_so; + if (size < 0 || size > (size_buf - 1)) + return -1; + memcpy(buf, regex->re_posix_text + regex->re_posix_match[number].rm_so, size); + buf[size] = '\0'; + return 0; +#if ENABLE_PCRE || ENABLE_PCRE2 + } else { #if ENABLE_PCRE - return pcre_copy_substring(regex->re_text, regex->re_match, - (regex->re_matches == 
0) - ? TVHREGEX_MAX_MATCHES - : regex->re_matches, - number, buf, size_buf) < 0; + return pcre_copy_substring(regex->re_text, regex->re_match, + (regex->re_matches == 0) + ? TVHREGEX_MAX_MATCHES + : regex->re_matches, + number, buf, size_buf) < 0; #elif ENABLE_PCRE2 - PCRE2_SIZE psiz = size_buf; - return pcre2_substring_copy_bynumber(regex->re_match, number, (PCRE2_UCHAR8*)buf, &psiz); -#else - if (regex->re_match[number].rm_so == -1) - return -1; - ssize_t size = regex->re_match[number].rm_eo - regex->re_match[number].rm_so; - if (size < 0 || size > (size_buf - 1)) - return -1; - memcpy(buf, regex->re_text + regex->re_match[number].rm_so, size); - buf[size] = '\0'; - return 0; + PCRE2_SIZE psiz = size_buf; + return pcre2_substring_copy_bynumber(regex->re_match, number, (PCRE2_UCHAR8*)buf, &psiz); +#endif + } #endif } @@ -583,6 +609,14 @@ int regex_match_substring_length(tvh_regex_t *regex, unsigned number) { if (number >= TVHREGEX_MAX_MATCHES) return -2; +#if ENABLE_PCRE || ENABLE_PCRE2 + if (regex->is_posix) { +#endif + if (regex->re_posix_match[number].rm_so == -1) + return -1; + return regex->re_posix_match[number].rm_eo - regex->re_posix_match[number].rm_so; +#if ENABLE_PCRE || ENABLE_PCRE2 + } else { #if ENABLE_PCRE if (number >= regex->re_matches) return -1; @@ -593,9 +627,7 @@ int regex_match_substring_length(tvh_regex_t *regex, unsigned number) PCRE2_SIZE len; int rc = pcre2_substring_length_bynumber(regex->re_match, number, &len); return (!rc) ? 
len : -1; -#else - if (regex->re_match[number].rm_so == -1) - return -1; - return regex->re_match[number].rm_eo - regex->re_match[number].rm_so; +#endif + } #endif } diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py index 7c68e2511..11ceb66cf 100755 --- a/support/eitscrape_test.py +++ b/support/eitscrape_test.py @@ -52,7 +52,7 @@ import os, sys import pprint import json import re - +import argparse class EITScrapeTest(object): def __init__(self): @@ -119,48 +119,41 @@ class EITScrapeTest(object): if test.has_key('new_summary'): self.run_test_case_i(text, subtitle_reg, test['new_summary'], "new_summary", match=2) - +def get_regs(parser, engine, key): + try: + l = parser[engine][key] + except KeyError: + l = parser[key] + res = [] + for reg in l: + res.append(re.compile(reg)) + return res def main(argv): - if len(argv) < 3: - sys.exit('Usage: %s scrapperfile scrappertestfile' % argv[0]) - - if not os.path.exists(argv[1]): - sys.exit('ERROR: scrapperfile "%s" was not found!' % argv[1]) - if not os.path.exists(sys.argv[2]): - sys.exit('ERROR: scrappertestfile "%s" was not found!' % argv[2]) - - print "Opening Parser file " + argv[1] - fp = open(argv[1], 'r') - parser = json.load(fp) + parser = argparse.ArgumentParser(description='Test scraper regular expressions') + group = parser.add_mutually_exclusive_group() + group.add_argument('--pcre', dest='engine', + action='store_const', const='pcre', + help='test PCRE regular expressions if available') + group.add_argument('--pcre2', dest='engine', + action='store_const', const='pcre2', + help='test PCRE2 regular expressions if available') + parser.add_argument('scraperfile', type=argparse.FileType('r')) + parser.add_argument('scrapertestfile', type=argparse.FileType('r')) + args = parser.parse_args() + + print(args.engine) + parser = json.load(args.scraperfile) pprint.pprint(parser, indent=2) # Compile the regular expressions that we will use. 
- sn_reg = [] - if parser.has_key('season_num'): - sn = parser['season_num'] - for reg in sn: sn_reg.append(re.compile(reg)) - - en_reg = [] - if parser.has_key('episode_num'): - en = parser['episode_num'] - for reg in en: en_reg.append(re.compile(reg)) - - airdate_reg = [] - if parser.has_key('airdate'): - airdate = parser['airdate'] - for reg in airdate: airdate_reg.append(re.compile(reg)) - - subtitle_reg = [] - if parser.has_key('scrape_subtitle'): - subtitle = parser['scrape_subtitle'] - for reg in subtitle: - subtitle_reg.append(re.compile(reg)) + sn_reg = get_regs(parser, args.engine, 'season_num') + en_reg = get_regs(parser, args.engine, 'episode_num') + airdate_reg = get_regs(parser, args.engine, 'airdate') + subtitle_reg = get_regs(parser, args.engine, 'scrape_subtitle') # Now parse the test file which is a JSON input file - print "Opening test input file " + argv[2] - fp = open(argv[2], 'r') - tests = json.load(fp) + tests = json.load(args.scrapertestfile) # And run the tests tester = EITScrapeTest() -- 2.47.3