import os, sys
import pprint
import json
-import re
import argparse
+try:
+ import regex as re
+ re_base_flag = re.VERSION1
+ re_posix_flag = re.POSIX
+except ImportError:
+ import re
+ re_base_flag = re_posix_flag = 0
+
class Regex(object):
    """A compiled scraper pattern that keeps its source text and engine name.

    Compilation uses the module-level ``re`` binding together with the
    module-level ``re_base_flag`` / ``re_posix_flag`` values.
    """

    def __init__(self, engine, regex):
        """Compile ``regex`` for ``engine``.

        engine -- engine name; when it is falsy, ``re_posix_flag`` is OR-ed
                  into the compile flags.
        regex  -- the pattern source text.
        """
        self.engine = engine
        self.regex = regex
        flags = re_base_flag
        if not engine:
            flags |= re_posix_flag
        self.regcomp = re.compile(regex, flags)

    def search(self, text):
        """Search ``text`` and return the concatenated capture groups.

        Returns None when the pattern does not match.  On a match, returns
        every non-empty capture group (group 1 onwards) joined in order;
        groups that did not participate, or that matched the empty string,
        are skipped, so the result may be "".
        """
        match = self.regcomp.search(text)
        if match is None:
            return None
        # groups() yields groups 1..N, with None for non-participating ones.
        return "".join(group for group in match.groups() if group)

    def text(self):
        """Return the pattern source text this object was built from."""
        return self.regex
class EITScrapeTest(object):
    """Runs scraper test cases and keeps a tally of passes and failures."""

    def __init__(self):
        # Running totals, updated by every check that is performed.
        self.num_failed = 0
        self.num_ok = 0

    def run_test_case_i(self, text, regexes, expect, testing):
        """Check one expected value against a list of regexes.

        text    -- the input string to search.
        regexes -- iterable of objects providing search(text), which returns
                   the extracted string or None, and text(), which returns
                   the pattern source used in the report.
        expect  -- the expected extraction, or None when no regex should
                   match.
        testing -- label for the field being tested, used in the report.

        The first regex that produces a result decides the outcome; later
        regexes are not tried.  The outcome is printed and counted in
        num_ok / num_failed.  Nothing is returned.
        """
        for regex in regexes:
            result = regex.search(text)
            if result is None:
                continue
            if result == expect:
                print('OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text()))
                self.num_ok = self.num_ok + 1
            else:
                print('FAIL: Got incorrect result of "{result}" expecting "{expect}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, expect=expect, testing=testing, text=text, pattern=regex.text()))
                self.num_failed = self.num_failed + 1
            return

        # No regex produced a result: that is only correct if none was expected.
        if expect is None:
            print('OK: Got correct result of "<none>" testing "{testing}" for "{text}"'.format(testing=testing, text=text))
            self.num_ok = self.num_ok + 1
        else:
            print('FAIL: No match in "{text}" while testing "{testing}" and expecting "{expect}"'.format(text=text, testing=testing, expect=expect))
            self.num_failed = self.num_failed + 1

    def run_test_case(self, engine, test, regexes):
        """Run a single test case.

        engine  -- name of the engine under test; may be falsy.
        test    -- dictionary describing the test case.  'summary' is the
                   text that is searched; for 'new_title' the 'title' value
                   and a space are prepended to it.
        regexes -- dictionary of regex lists named by the canonical name of
                   the test result.  A falsy entry is reported as a failure
                   when a test case asks for that result.

        The canonical name may, in the test data, be suffixed by the name of
        the engine, e.g. new_subtitle:pcre2.  Keys suffixed for a different
        engine are ignored, and a key suffixed for this engine replaces the
        unsuffixed key of the same canonical name.

        Raises SyntaxWarning when a key's canonical name is not in regexes
        and is not one of the untested keys.
        """
        keys_to_test = set()
        for key in test:
            canonical, _, for_engine = key.partition(':')
            if for_engine and for_engine != engine:
                continue
            if canonical in ('comment', 'summary', 'title'):
                continue
            if canonical in ('age', 'genre'):
                print('Test case contains key "{key}" which is not currently tested for "{test}"'.format(key=key, test=test))
                continue
            if canonical not in regexes:
                print('Test case contains invalid key "{key}" (possible typo) for "{test}"'.format(key=key, test=test))
                raise SyntaxWarning('Test case contains invalid/unknown key {}'.format(key))
            if for_engine:
                # The engine-specific expectation overrides the generic one.
                keys_to_test.discard(canonical)
                keys_to_test.add(key)
            elif not engine or key + ':' + engine not in keys_to_test:
                keys_to_test.add(key)

        # Sorted so that the report order does not depend on set ordering.
        for key in sorted(keys_to_test):
            canonical = key.partition(':')[0]
            text = test['summary']
            if canonical == 'new_title':
                text = test['title'] + ' ' + text
            if regexes[canonical]:
                self.run_test_case_i(text, regexes[canonical], test[key], key)
            else:
                print('FAIL: no regex defined for key "{key}"'.format(key=canonical))
                self.num_failed = self.num_failed + 1
+
def get_regs(parser, engine, key):
    """Build the list of Regex objects configured under ``key``.

    parser -- mapping holding the scraper configuration.
    engine -- engine name; patterns under parser[engine][key] are preferred.
    key    -- configuration key naming the pattern list.

    If parser[engine][key] raises KeyError, parser[key] is used instead.
    Returns None when neither lookup finds the key; otherwise a list with
    one Regex(engine, pattern) per configured pattern, in order.
    """
    try:
        patterns = parser[engine][key]
    except KeyError:
        # No engine-specific patterns: fall back to the generic ones.
        try:
            patterns = parser[key]
        except KeyError:
            return None
    return [Regex(engine, pattern) for pattern in patterns]
def main(argv):
pprint.pprint(parser, indent=2)
# Compile the regular expressions that we will use.
- sn_reg = get_regs(parser, args.engine, 'season_num')
- en_reg = get_regs(parser, args.engine, 'episode_num')
- airdate_reg = get_regs(parser, args.engine, 'airdate')
- subtitle_reg = get_regs(parser, args.engine, 'scrape_subtitle')
+ regexes = {}
+ regexes["season"] = get_regs(parser, args.engine, 'season_num')
+ regexes["episode"] = get_regs(parser, args.engine, 'episode_num')
+ regexes["airdate"] = get_regs(parser, args.engine, 'airdate')
+ regexes["new_title"] = get_regs(parser, args.engine, 'scrape_title')
+ regexes["new_subtitle"] = get_regs(parser, args.engine, 'scrape_subtitle')
+ regexes["new_summary"] = get_regs(parser, args.engine, 'scrape_summary')
# Now parse the test file which is a JSON input file
tests = json.load(args.scrapertestfile)
for test in tests['tests']:
print "Running test" + str(test)
pprint.pprint(test)
- tester.run_test_case(test, sn_reg, en_reg, airdate_reg, subtitle_reg)
+ tester.run_test_case(args.engine, test, regexes)
# And show the results
print "\n\nSummary:\tNumOK: %s\tNumFailed: %s" %(tester.num_ok, tester.num_failed)