From: Jim Hague
Date: Thu, 21 Dec 2017 10:47:22 +0000 (+0000)
Subject: eit: rework EIT scraper test script and add POSIX matching (#4801)
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e986dffe36ac4c1dc536a2a6475ade1e3502e03b;p=thirdparty%2Ftvheadend.git

eit: rework EIT scraper test script and add POSIX matching (#4801)

Add support for new_title and new_summary test fields, and make adding
new fields easier in the future.

Rework regex handling to carry regex engine type info with the regex.
If the PyPI package 'regex' is available, use it and set its POSIX flag
when evaluating POSIX regexes. This doesn't restrict the regex to
POSIX-only expressions, but it does do POSIX-style leftmost-longest
matching, which is the significant behavioural difference between PCRE
and POSIX expressions.

Issue: #4801
---

diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py
index a38d0ef94..af886b901 100755
--- a/support/eitscrape_test.py
+++ b/support/eitscrape_test.py
@@ -51,82 +51,123 @@ error:
 import os, sys
 import pprint
 import json
-import re
 import argparse
+try:
+    import regex as re
+    re_base_flag = re.VERSION1
+    re_posix_flag = re.POSIX
+except ImportError:
+    import re
+    re_base_flag = re_posix_flag = 0
+
+class Regex(object):
+    def __init__(self, engine, regex):
+        self.engine = engine
+        self.regex = regex
+        flags = re_base_flag
+        if not engine:
+            flags |= re_posix_flag
+        self.regcomp = re.compile(regex, flags)
+
+    def search(self, text):
+        match = self.regcomp.search(text)
+        res = None
+        if match:
+            i = 1
+            res = ""
+            while True:
+                try:
+                    g = match.group(i)
+                except IndexError:
+                    break
+                if g:
+                    res = res + g
+                i += 1
+        return res
+
+    def text(self):
+        return self.regex
+
 class EITScrapeTest(object):
     def __init__(self):
         self.num_failed = 0;
         self.num_ok = 0;
 
-    def run_test_case_i(self, text, reg, expect, testing, match=1):
+    def run_test_case_i(self, text, regexes, expect, testing):
         """Run a test case for text using the regular expression lists in
         reg, expecting the result of a match to be expect while running a
         test case for the string testing."""
-        for iter in reg:
-            m = iter.search(text)
-            if (m is not None):
-                result = m.group(match)
+        for regex in regexes:
+            result = regex.search(text)
+            if result is not None:
                 if result == expect:
-                    print 'OK: Got correct result of "%s" testing "%s" for "%s" using "%s"' % (result, testing, text, iter.pattern)
+                    print 'OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text())
                     self.num_ok = self.num_ok + 1
-                    return 1
                 else:
-                    print 'FAIL: Got incorrect result of "%s" expecting "%s" testing "%s" for "%s" using "%s"' % (result, expect, testing, text, iter.pattern)
+                    print 'FAIL: Got incorrect result of "{result}" expecting "{expect}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, expect=expect, testing=testing, text=text, pattern=regex.text())
                     self.num_failed = self.num_failed + 1
-                return 0
+                return
         if expect is None:
-            print 'OK: Got correct result of "" testing "%s" for "%s"' % (testing, text)
+            print 'OK: Got correct result of "" testing "{testing}" for "{text}"'.format(testing=testing, text=text)
             self.num_ok = self.num_ok + 1
-            return 1
         else:
-            extra = ""
-            if reg is None or len(reg) == 0:
-                extra = "(No regex loaded to parse this)"
-            print 'FAIL: No match for "%s" while testing "%s" and expecting "%s" %s' % (text, testing, expect, extra)
+            print 'FAIL: No match in "{text}" while testing "{testing}" and expecting "{expect}"'.format(text=text, testing=testing, expect=expect)
             self.num_failed = self.num_failed + 1
-            return 0
 
+    def run_test_case(self, engine, test, regexes):
+        """Run a single test case.
+
+        regexes is a dictionary of compiled regexes named by the canonical name
+        of the test result.
 
-    def run_test_case(self, test, sn_reg, en_reg, airdate_reg, subtitle_reg):
-        """sn_reg List of season regular expression extractors.
-        en_reg List of episode number extractors.
-        airdate_reg List of airdate extractors.
-        subtitle_reg List of subtitle extractors.
+        The canonical name may, in the test data, be suffixed by a colon and
+        the name of the engine, e.g. new_subtitle:pcre2.
         """
+        keys_to_test = set()
         for key in test.keys():
-            if key in ('age', 'genre'):
-                print 'Test case contains key "%s" which is not currently tested for "%s"' % (key, test)
-
-            if key not in ('age', 'airdate', 'comment', 'episode', 'genre', 'new_subtitle', 'new_summary', 'season', 'summary'):
-                print 'Test case contains invalid key "%s" (possible typo) for "%s"' % (key, test)
-                raise SyntaxWarning('Test case contains invalid/unknown key "%s" (possible typo) for "%s"' % (key, test))
-
-        # We are currently only testing against the summary field in the data
-        # as the input from the EIT.
-        text = test['summary']
-
-        # We have to use "has_key" since sometimes our valid result is "None"
-        if test.has_key('season'):
-            self.run_test_case_i(text, sn_reg, test['season'], "season")
-        if test.has_key('episode'):
-            self.run_test_case_i(text, en_reg, test['episode'], "episode")
-        if test.has_key('airdate'):
-            self.run_test_case_i(text, airdate_reg, test['airdate'], "airdate")
-        if test.has_key('new_subtitle'):
-            self.run_test_case_i(text, subtitle_reg, test['new_subtitle'], "new_subtitle")
-        if test.has_key('new_summary'):
-            self.run_test_case_i(text, subtitle_reg, test['new_summary'], "new_summary", match=2)
+            canonical, _, for_engine = key.partition(':')
+            if for_engine and for_engine != engine:
+                continue
+            if canonical in ('comment', 'summary', 'title'):
+                continue
+            if canonical in ('age', 'genre'):
+                print 'Test case contains key "{key}" which is not currently tested for "{test}"'.format(key=key, test=test)
+                continue
+            if canonical not in regexes:
+                print 'Test case contains invalid key "{key}" (possible typo) for "{test}"'.format(key=key, test=test)
+                raise SyntaxWarning('Test case contains invalid/unknown key {}'.format(key))
+            if for_engine:
+                keys_to_test.discard(canonical)
+                keys_to_test.add(key)
+            else:
+                if not engine or key + ':' + engine not in keys_to_test:
+                    keys_to_test.add(key)
+
+        for key in keys_to_test:
+            canonical, _, _ = key.partition(':')
+            text = test['summary']
+            if canonical == 'new_title':
+                text = test['title'] + ' ' + text
+            if regexes[canonical]:
+                self.run_test_case_i(text, regexes[canonical], test[key], key)
+            else:
+                print 'FAIL: no regex defined for key "{key}"'.format(key=canonical)
+                self.num_failed = self.num_failed + 1
+
 def get_regs(parser, engine, key):
     try:
         l = parser[engine][key]
     except KeyError:
-        l = parser[key]
+        try:
+            l = parser[key]
+        except KeyError:
+            return None
     res = []
     for reg in l:
-        res.append(re.compile(reg))
+        res.append(Regex(engine, reg))
     return res
 
 def main(argv):
@@ -150,10 +191,13 @@ def main(argv):
         pprint.pprint(parser, indent=2)
 
     # Compile the regular expressions that we will use.
-    sn_reg = get_regs(parser, args.engine, 'season_num')
-    en_reg = get_regs(parser, args.engine, 'episode_num')
-    airdate_reg = get_regs(parser, args.engine, 'airdate')
-    subtitle_reg = get_regs(parser, args.engine, 'scrape_subtitle')
+    regexes = {}
+    regexes["season"] = get_regs(parser, args.engine, 'season_num')
+    regexes["episode"] = get_regs(parser, args.engine, 'episode_num')
+    regexes["airdate"] = get_regs(parser, args.engine, 'airdate')
+    regexes["new_title"] = get_regs(parser, args.engine, 'scrape_title')
+    regexes["new_subtitle"] = get_regs(parser, args.engine, 'scrape_subtitle')
+    regexes["new_summary"] = get_regs(parser, args.engine, 'scrape_summary')
 
     # Now parse the test file which is a JSON input file
     tests = json.load(args.scrapertestfile)
@@ -164,7 +208,7 @@ def main(argv):
     for test in tests['tests']:
         print "Running test" + str(test)
         pprint.pprint(test)
-        tester.run_test_case(test, sn_reg, en_reg, airdate_reg, subtitle_reg)
+        tester.run_test_case(args.engine, test, regexes)
 
     # And show the results
     print "\n\nSummary:\tNumOK: %s\tNumFailed: %s" %(tester.num_ok, tester.num_failed)
diff --git a/support/testdata/eitscrape/README b/support/testdata/eitscrape/README
index c95e36ed5..bc3bd0aea 100644
--- a/support/testdata/eitscrape/README
+++ b/support/testdata/eitscrape/README
@@ -8,18 +8,26 @@ It then outputs OK or FAIL for every test case.
 The format is JSON.
 
 Input:
 
-- "comment' - explanation for the test if there is something unusual
+- "comment" - explanation for the test if there is something unusual
   about it such as broadcaster reversing episode and season.
 
+- "title" - title field from EIT broadcast that will be scraped.
+
 - "summary" - summary field from EIT broadcast that will be scraped.
 
 Expected Result:
 
-- 'age', 'airdate', 'comment', 'episode', 'genre', 'season':
+- 'age', 'airdate', 'comment', 'episode', 'genre', 'season',
+  'new_title', 'new_subtitle', 'new_summary':
   expected scraped values. "null" is an accepted value such as
   '"season": null' for cases that should explicitly not set season.
   Currently we do not scrape age (12, 15, 18 etc) or genre but it is
   included since some broadcasters have this information in the summary
   and it may be supported in the future.
+
+- if different scraped values are expected for different regex engines,
+  give the expected scraped values for a particular regex engine ('pcre',
+  'pcre1', 'pcre2') by appending a colon and the name of the engine
+  to the field name, e.g. 'new_title:pcre'.
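
Example test case (illustrative):

A minimal test-file entry using the new fields might look like the sketch
below. It assumes the top-level "tests" array that eitscrape_test.py
iterates over; the programme title, summary text, expected values and the
choice of 'pcre2' as the engine suffix are invented for illustration and do
not come from the real test data under support/testdata/eitscrape.

    {
      "tests": [
        {
          "comment": "illustrative entry only - all values are invented",
          "title": "Some Programme",
          "summary": "2/6. Pilot: a made-up synopsis of the episode.",
          "season": null,
          "episode": "2",
          "new_subtitle": "Pilot",
          "new_subtitle:pcre2": "Pilot:"
        }
      ]
    }

A key without an engine suffix applies to every engine; a suffixed key such
as 'new_subtitle:pcre2' replaces the plain key when that engine is selected.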
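
Note on POSIX matching (illustrative):

The leftmost-longest behaviour mentioned in the commit message can be seen
with a small sketch like the one below. It assumes the PyPI 'regex' package
is installed; the pattern and input string are invented for illustration and
are not taken from any scraper configuration.

    import re
    import regex

    pattern = r'one(self)?(selfsufficient)?'
    text = 'oneselfsufficient'

    # Stock 're' returns the first match its backtracking engine finds.
    print(re.match(pattern, text).group(0))
    # -> 'oneself'

    # The 'regex' package with the POSIX flag keeps the leftmost-longest
    # match, as a POSIX regcomp/regexec implementation would.
    print(regex.match(pattern, text, flags=regex.POSIX).group(0))
    # -> 'oneselfsufficient'

The reworked test script applies the same flag (together with
regex.VERSION1) when it compiles regexes for the default, POSIX-style
engine.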