eit: allow scraper regexes to be marked as filters (#4818)

author Jim Hague <jim.hague@acm.org>

Wed, 27 Dec 2017 14:55:02 +0000 (14:55 +0000)

committer Jaroslav Kysela <perex@perex.cz>

Thu, 28 Dec 2017 15:29:31 +0000 (16:29 +0100)
author Jim Hague <jim.hague@acm.org>
Wed, 27 Dec 2017 14:55:02 +0000 (14:55 +0000)
committer Jaroslav Kysela <perex@perex.cz>
Thu, 28 Dec 2017 15:29:31 +0000 (16:29 +0100)
diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README

index 41ad82c3d0b16e04b43c189dc39307366e287253..8c57e63c1204f40a47219220a0e0e1ba21470568 100644 (file)
--- a/data/conf/epggrab/eit/scrape/README
+++ b/data/conf/epggrab/eit/scrape/README
@@ -24,7 +24,7 @@ of the match is the contents of all the sub-patterns in the regular
  expression concatenated together.
  
  For each EPG episode, the title, description and summary are matched
-in turn against the season_num, episode_num, airdate and is_new regexes.
+in turn against the season_num, episode_num, airdate and is_new regex lists.
  
  - season_num converts the contents of the match result to an integer,
    and if successful sets the EPG season number.
@@ -37,14 +37,56 @@ in turn against the season_num, episode_num, airdate and is_new regexes.
    the match result is ignored.
  
  Next, a combined title/summary text is made by joining the title, a space,
-and the summary. The combined text is matched against the scrape_title regex.
-On a match, the EPG title is set to the match result.
+and the summary. The combined text is matched against the scrape_title regex
+list. On a match, the EPG title is set to the match result.
  
-Then the summary is matched against the scrape_subtitle regex. On a match,
+Then the summary is matched against the scrape_subtitle regex list. On a match,
  the EPG subtitle is set to the match result.
  
-Finally, the summary is matched against the scrape_summary regex. On a match,
-the EPG summary is set to the match result.
+Finally, the summary is matched against the scrape_summary regex list. On a
+match, the EPG summary is set to the match result.
+
+Filtering regular expressions
+-----------------------------
+
+Any regular expression in a list can be marked as a filtering regular
+expression. If a regular expression is marked as a filter and it matches
+the input text, the match result is not returned; instead it replaces
+the text being matched, and matching continues with the next regular
+expression in the list, which sees only the filtered text. If a filter
+regular expression does not match, matching moves to the next regular
+expression in the list as usual, with the text unchanged.
+
+To mark a regular expression as a filter, specify it as an expanded
+definition: an object with a "pattern" component holding the regular
+expression pattern and an optional numeric "filter" component. If "filter"
+is present and not 0, the regular expression is a filter.
+
+For example, in the following list, the first regex is a filter that
+removes any first sentence starting "...". The following regexes see
+only the text following that sentence.
+
+{
+  "scrape_subtitle": [
+      {
+          "pattern": "^[.][.][.][^:.?!]*[.:?!] +(.*)",
+          "filter": 1
+      },
+      {
+          "pattern": "^[0-9]+/[0-9]+[.] +(.*)",
+          "filter": 1
+      },
+      "^([^:]+): "
+  ]
+}
+
+Given any of the following input texts, the above regex list matches
+'Subtitle here':
+
+...Continued title. 1/6. Subtitle here: rest of summary
+...Continued title. Subtitle here: rest of summary
+1/6. Subtitle here: rest of summary
+Subtitle here: rest of summary
  
  Regular expression engine
  -------------------------
diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c

index 15c1aeb42ae9c1237a28de12c46d906f4ba9fce1..9511413036b0ded27235d22ceaa7b850f3c6673b 100644 (file)
--- a/src/epggrab/module/eitpatternlist.c
+++ b/src/epggrab/module/eitpatternlist.c
@@ -22,25 +22,36 @@
  #include "eitpatternlist.h"
  #include "htsmsg.h"
  
+#define MAX_TEXT_LEN    2048
+
  void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags )
  {
    eit_pattern_t *pattern;
    htsmsg_field_t *f;
    const char *s;
+  int filter;
  
    TAILQ_INIT(list);
    if (!l) return;
    HTSMSG_FOREACH(f, l) {
      s = htsmsg_field_get_str(f);
-    if (s == NULL) continue;
+    filter = 0;
+    if (s == NULL) {
+      htsmsg_t *m = htsmsg_field_get_map(f);
+      if (m == NULL) continue;
+      s = htsmsg_get_str(m, "pattern");
+      if (s == NULL) continue;
+      filter = htsmsg_get_bool_or_default(m, "filter", 0);
+    }
      pattern = calloc(1, sizeof(eit_pattern_t));
      pattern->text = strdup(s);
+    pattern->filter = filter;
      if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) {
        tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text);
        free(pattern->text);
        free(pattern);
      } else {
-      tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\"", pattern->text);
+      tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\", filter %d", pattern->text, pattern->filter);
        TAILQ_INSERT_TAIL(list, pattern, p_links);
      }
    }
@@ -74,7 +85,8 @@ static void rtrim(char *buf)
  void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
  {
    eit_pattern_t *p;
-  char matchbuf[2048];
+  char textbuf[MAX_TEXT_LEN];
+  char matchbuf[MAX_TEXT_LEN];
    int matchno;
  
    assert(buf);
@@ -93,7 +105,13 @@ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_p
          strncat(buf, matchbuf, size_buf - len - 1);
        }
        rtrim(buf);
-      tvhtrace(LS_EPGGRAB,"  pattern \"%s\" matches with '%s'", p->text, buf);
+      tvhtrace(LS_EPGGRAB,"  pattern \"%s\" matches '%s' from '%s'", p->text, buf, text);
+      if (p->filter) {
+        strncpy(textbuf, buf, MAX_TEXT_LEN - 1);
+        textbuf[MAX_TEXT_LEN - 1] = '\0';
+        text = textbuf;
+        continue;
+      }
        return buf;
      }
    return NULL;
diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h

index dc6a02416df60bad37813f17267756fde59e1c39..c7c24519b1f447024850397b1d15826de4326bdd 100644 (file)
--- a/src/epggrab/module/eitpatternlist.h
+++ b/src/epggrab/module/eitpatternlist.h
@@ -26,6 +26,7 @@ typedef struct eit_pattern
  {
    char                        *text;
    tvh_regex_t                 compiled;
+  int                         filter;
    TAILQ_ENTRY(eit_pattern)    p_links;
  } eit_pattern_t;
  
diff --git a/support/eitscrape_test.py b/support/eitscrape_test.py

index af886b90111a982c402262a55122c8a0263e3d42..19233ec525ef4b7ed68b6e4e3434805361cce1d0 100755 (executable)
--- a/support/eitscrape_test.py
+++ b/support/eitscrape_test.py
@@ -64,11 +64,16 @@ except ImportError:
  class Regex(object):
    def __init__(self, engine, regex):
      self.engine = engine
-    self.regex = regex
+    if isinstance(regex, dict):
+      self.regex = regex["pattern"]
+      self.re_is_filter = (regex["filter"] != 0)
+    else:
+      self.regex = regex
+      self.re_is_filter = False
      flags = re_base_flag
      if not engine:
        flags |= re_posix_flag
-    self.regcomp = re.compile(regex, flags)
+    self.regcomp = re.compile(self.regex, flags)
  
    def search(self, text):
      match = self.regcomp.search(text)
@@ -101,6 +106,9 @@ class EITScrapeTest(object):
      for regex in regexes:
        result = regex.search(text)
        if result is not None:
+        if regex.re_is_filter:
+          text = result
+          continue
          if result == expect:
            print 'OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text())
            self.num_ok = self.num_ok + 1
author	Jim Hague <jim.hague@acm.org>
	Wed, 27 Dec 2017 14:55:02 +0000 (14:55 +0000)
committer	Jaroslav Kysela <perex@perex.cz>
	Thu, 28 Dec 2017 15:29:31 +0000 (16:29 +0100)
data/conf/epggrab/eit/scrape/README		patch \| blob \| blame \| history
src/epggrab/module/eitpatternlist.c		patch \| blob \| blame \| history
src/epggrab/module/eitpatternlist.h		patch \| blob \| blame \| history
support/eitscrape_test.py		patch \| blob \| blame \| history