eit: rework EIT scraper test script and add POSIX matching (#4801)

author Jim Hague <jim@sinodun.com>

Thu, 21 Dec 2017 10:47:22 +0000 (10:47 +0000)

committer Jaroslav Kysela <perex@perex.cz>

Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
author Jim Hague <jim@sinodun.com>
Thu, 21 Dec 2017 10:47:22 +0000 (10:47 +0000)
committer Jaroslav Kysela <perex@perex.cz>
Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py

index a38d0ef94384fe282068bfd29a0eefdf11121bf3..af886b90111a982c402262a55122c8a0263e3d42 100755 (executable)
--- a/support/eitscrape_test.py
+++ b/support/eitscrape_test.py
@@ -51,82 +51,123 @@ error:
  import os, sys
  import pprint
  import json
-import re
  import argparse
  
+try:
+  import regex as re
+  re_base_flag = re.VERSION1
+  re_posix_flag = re.POSIX
+except ImportError:
+  import re
+  re_base_flag = re_posix_flag = 0
+
+class Regex(object):
+  def __init__(self, engine, regex):
+    self.engine = engine
+    self.regex = regex
+    flags = re_base_flag
+    if not engine:
+      flags |= re_posix_flag
+    self.regcomp = re.compile(regex, flags)
+
+  def search(self, text):
+    match = self.regcomp.search(text)
+    res = None
+    if match:
+      i = 1
+      res = ""
+      while True:
+        try:
+          g = match.group(i)
+        except IndexError:
+          break
+        if g:
+          res = res + g
+        i += 1
+    return res
+
+  def text(self):
+    return self.regex
+
  class EITScrapeTest(object):
    def __init__(self):
      self.num_failed = 0;
      self.num_ok = 0;
  
-  def run_test_case_i(self, text, reg, expect, testing, match=1):
+  def run_test_case_i(self, text, regexes, expect, testing):
      """Run a test case for text using the regular expression lists in reg,
      expecting the result of a match to be expect while running a test
      case for the string testing."""
-    for iter in reg:
-      m = iter.search(text)
-      if (m is not None):
-        result = m.group(match)
+    for regex in regexes:
+      result = regex.search(text)
+      if result is not None:
          if result == expect:
-          print 'OK: Got correct result of "%s" testing "%s" for "%s" using "%s"' % (result, testing, text, iter.pattern)
+          print 'OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text())
            self.num_ok = self.num_ok + 1
-          return 1
          else:
-          print 'FAIL: Got incorrect result of "%s" expecting "%s" testing "%s" for "%s" using "%s"' % (result, expect, testing, text, iter.pattern)
+          print 'FAIL: Got incorrect result of "{result}" expecting "{expect}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, expect=expect, testing=testing, text=text, pattern=regex.text())
            self.num_failed = self.num_failed + 1
-          return 0
+        return
  
      if expect is None:
-      print 'OK: Got correct result of "<none>" testing "%s" for "%s"' % (testing, text)
+      print 'OK: Got correct result of "<none>" testing "{testing}" for "{text}"'.format(testing=testing, text=text)
        self.num_ok = self.num_ok + 1
-      return 1
      else:
-      extra = ""
-      if reg is None or len(reg) == 0:
-        extra = "(No regex loaded to parse this)"
-      print 'FAIL: No match for "%s" while testing "%s" and expecting "%s" %s' % (text, testing, expect, extra)
+      print 'FAIL: No match in "{text}" while testing "{testing}" and expecting "{expect}"'.format(text=text, testing=testing, expect=expect)
        self.num_failed = self.num_failed + 1
-      return 0
  
+  def run_test_case(self, engine, test, regexes):
+    """run a single test case.
+
+    regexes is a dictionary of compiled regexes named by the canonical name
+    of the test result.
  
-  def run_test_case(self, test, sn_reg, en_reg, airdate_reg, subtitle_reg):
-    """sn_reg List of season regular expression extractors.
-    en_reg List of episode number extractors.
-    airdate_reg List of airdate extractors.
-    subtitle_reg List of subtitle extractors.
+    The canonical name may, in the test data, be suffixed by a colon and
+    the name of the engine, e.g. new_subtitle:pcre2.
    """
+    keys_to_test = set()
      for key in test.keys():
-      if key in ('age', 'genre'):
-        print 'Test case contains key "%s" which is not currently tested for "%s"' % (key, test)
-
-      if key not in ('age', 'airdate', 'comment', 'episode', 'genre', 'new_subtitle', 'new_summary', 'season', 'summary'):
-        print 'Test case contains invalid key "%s" (possible typo) for "%s"' % (key, test)
-        raise SyntaxWarning('Test case contains invalid/unknown key "%s" (possible typo) for "%s"' % (key, test))
-
-    # We are currently only testing against the summary field in the data
-    # as the input from the EIT.
-    text = test['summary']
-
-    # We have to use "has_key" since sometimes our valid result is "None"
-    if test.has_key('season'):
-      self.run_test_case_i(text, sn_reg, test['season'], "season")
-    if test.has_key('episode'):
-      self.run_test_case_i(text, en_reg, test['episode'], "episode")
-    if test.has_key('airdate'):
-      self.run_test_case_i(text, airdate_reg, test['airdate'], "airdate")
-    if test.has_key('new_subtitle'):
-      self.run_test_case_i(text, subtitle_reg, test['new_subtitle'], "new_subtitle")
-    if test.has_key('new_summary'):
-      self.run_test_case_i(text, subtitle_reg, test['new_summary'], "new_summary", match=2)
+      canonical, _, for_engine = key.partition(':')
+      if for_engine and for_engine != engine:
+        continue
+      if canonical in ('comment', 'summary', 'title'):
+        continue
+      if canonical in ('age', 'genre'):
+        print 'Test case contains key "{key}" which is not currently tested for "{test}"'.format(key=key, test=test)
+        continue
+      if canonical not in regexes:
+        print 'Test case contains invalid key "{key}" (possible typo) for "{test}"'.format(key=key, test=test)
+        raise SyntaxWarning('Test case contains invalid/unknown key {}'.format(key))
+      if for_engine:
+        keys_to_test.discard(canonical)
+        keys_to_test.add(key)
+      else:
+        if not engine or key + ':' + engine not in keys_to_test:
+          keys_to_test.add(key)
+
+    for key in keys_to_test:
+      canonical, _, _ = key.partition(':')
+      text = test['summary']
+      if canonical == 'new_title':
+        text = test['title'] + ' ' + text
+      if regexes[canonical]:
+        self.run_test_case_i(text, regexes[canonical], test[key], key)
+      else:
+        print 'FAIL: no regex defined for key "{key}"'.format(key=canonical)
+        self.num_failed = self.num_failed + 1
+
  
  def get_regs(parser, engine, key):
    try:
      l = parser[engine][key]
    except KeyError:
-    l = parser[key]
+    try:
+      l = parser[key]
+    except KeyError:
+      return None
    res = []
    for reg in l:
-    res.append(re.compile(reg))
+    res.append(Regex(engine, reg))
    return res
  
  def main(argv):
@@ -150,10 +191,13 @@ def main(argv):
    pprint.pprint(parser, indent=2)
  
    # Compile the regular expressions that we will use.
-  sn_reg = get_regs(parser, args.engine, 'season_num')
-  en_reg = get_regs(parser, args.engine, 'episode_num')
-  airdate_reg = get_regs(parser, args.engine, 'airdate')
-  subtitle_reg = get_regs(parser, args.engine, 'scrape_subtitle')
+  regexes = {}
+  regexes["season"] = get_regs(parser, args.engine, 'season_num')
+  regexes["episode"] = get_regs(parser, args.engine, 'episode_num')
+  regexes["airdate"] = get_regs(parser, args.engine, 'airdate')
+  regexes["new_title"] = get_regs(parser, args.engine, 'scrape_title')
+  regexes["new_subtitle"] = get_regs(parser, args.engine, 'scrape_subtitle')
+  regexes["new_summary"] = get_regs(parser, args.engine, 'scrape_summary')
  
    # Now parse the test file which is a JSON input file
    tests = json.load(args.scrapertestfile)
@@ -164,7 +208,7 @@ def main(argv):
    for test in tests['tests']:
      print "Running test" + str(test)
      pprint.pprint(test)
-    tester.run_test_case(test, sn_reg, en_reg, airdate_reg, subtitle_reg)
+    tester.run_test_case(args.engine, test, regexes)
  
    # And show the results
    print "\n\nSummary:\tNumOK: %s\tNumFailed: %s" %(tester.num_ok, tester.num_failed)
diff --git a/support/testdata/eitscrape/README b/support/testdata/eitscrape/README

index c95e36ed5b8ac20124a6b93e2ed3696beb54b386..bc3bd0aea227039908158ee63d690116e56fbec7 100644 (file)
--- a/support/testdata/eitscrape/README
+++ b/support/testdata/eitscrape/README
@@ -8,18 +8,26 @@ It then outputs OK or FAIL for every test case.
  The format is JSON.
  
  Input:
-- "comment' - explanation for the test if there is something unusual
+- "comment" - explanation for the test if there is something unusual
        about it such as broadcaster reversing episode and season.
  
+- "title" - title field from EIT broadcast that will be scraped.
+
  - "summary" - summary field from EIT broadcast that will be scraped.
  
  
  Expected Result:
  
-- 'age', 'airdate', 'comment', 'episode', 'genre', 'season':
+- 'age', 'airdate', 'comment', 'episode', 'genre', 'season',
+  'new_title', 'new_subtitle', 'new_summary':
     expected scraped values. "null" is an accepted value such as '"season": null'
     for cases that should explicitly not set season.
  
     Currently we do not scrape age (12, 15, 18 etc) or genre but it is
     included since some broadcasters have this information in the
     summary and it may be supported in the future.
+
+- If different scraped values are expected for different regex engines,
+  give the expected scraped value for a particular regex engine ('pcre',
+  'pcre1', 'pcre2') by appending a colon and the name of the engine
+  to the field name, e.g. 'new_title:pcre'.
author	Jim Hague <jim@sinodun.com>
	Thu, 21 Dec 2017 10:47:22 +0000 (10:47 +0000)
committer	Jaroslav Kysela <perex@perex.cz>
	Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
support/eitscrape_test.py		patch \| blob \| blame \| history
support/testdata/eitscrape/README		patch \| blob \| blame \| history