From: Jim Hague <jim.hague@acm.org>
Date: Thu, 28 Dec 2017 11:16:50 +0000 (+0000)
Subject: eit: update UK scrapers to use new facilities (#4818)
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8b46f62bef420aeab591f1f5094ab461d73914ea;p=thirdparty%2Ftvheadend.git

eit: update UK scrapers to use new facilities (#4818)

Refresh the UK scrapers to exploit recent mechanism changes. Add title
and summary scrapers, alternate PCRE patterns that offer a little more
sophistication, and make use of filter patterns to simplify matching a
little.

Issue: #4818
---

diff --git a/data/conf/epggrab/eit/scrape/README.uk b/data/conf/epggrab/eit/scrape/README.uk
new file mode 100644
index 000000000..042caf1a5
--- /dev/null
+++ b/data/conf/epggrab/eit/scrape/README.uk
@@ -0,0 +1,61 @@
+Scrapers for UK Freeview/Freesat
+================================
+
+The 'uk' scraper set provides scrapers for use with UK Freeview DVB-T
+and Freesat DVB-S EPG.
+
+The title, subtitle and summary scrapers are designed to work together, so
+for best results, all three should be enabled. They try to ensure that when
+text is selected for use in, say, the title, it is not selected for the
+subtitle or the summary. In other words, the same text should not be repeated
+in other fields.
+
+The title, subtitle and summary patterns attempt to populate the title,
+subtitle and summary from the EIT title and summary fields.
+
+1. If a title ends '...' and the summary starts '...', the start of the
+   summary is treated as a continuation of the title. The first sentence
+   is extracted and joined to the title with a space. The '...' are removed.
+   The used sentence is removed from the subtitle and summary.
+
+2. If the summary starts with a clause ending ':', or a short sentence
+   of between 4 and 70 characters, that is used as the subtitle.
+   It may be preceded by an episode number or New flag.
+
+3. Remaining text is used as the summary.
+
+POSIX and PCRE
+--------------
+
+TvHeadend is often compiled with the PCRE regular expression library;
+for example, this is the case in the distributed Debian packages. If so,
+alternate PCRE regular expressions are provided. These use PCRE facilities,
+notably lookbehinds and non-capturing groups, to attempt a little more
+sophistication in scraping.
+
+1. If both an episode number and a New flag occur at the start of the input
+   summary, the PCRE-scraped summary retains both; POSIX retains only the last.
+
+2. A '.' followed by a space only marks the end of a sentence if it
+   does not follow one of a small set of known abbreviations, e.g. Dr.
+   or Ms., and does not follow a character preceded by another '.'. In
+   other words, an acronym such as 'S.H.I.E.L.D.' will not terminate
+   a sentence.
+
+Tests
+-----
+
+Test data is provided under support/testdata/eitscrape. Tests can be run
+from the support directory. Test the PCRE patterns with:
+
+tvheadend/support$ ./eitscrape_test.py --pcre ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk
+
+and the POSIX patterns with:
+
+tvheadend/support$ ./eitscrape_test.py ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk
+
+NOTE: To test the PCRE patterns, you MUST have the Python 'regex' package
+installed. On Debian-derived systems, this is packaged as 'python-regex'.
+The default Python 're' package is not sufficiently compatible with PCRE;
+specifically, it does not permit variable-length lookbehinds, and so fails
+to compile the PCRE patterns.
diff --git a/data/conf/epggrab/eit/scrape/uk b/data/conf/epggrab/eit/scrape/uk
index d7286ba47..de3191ff0 100644
--- a/data/conf/epggrab/eit/scrape/uk
+++ b/data/conf/epggrab/eit/scrape/uk
@@ -15,19 +15,67 @@
     "^([0-9]+)/[0-9]+[.] "
   ],
   "airdate": [
-    "\\(([0-9][0-9][0-9][0-9])\\)"
+    "\\(([0-9]{4})\\)",
+    ", ([0-9]{4})[.]",
+    "^([0-9]{4})[.]"
+  ],
+  "is_new" : [
+      "New[.:] "
+  ],
+  "scrape_title": [
+      "^(.+)[.]{3}( )[.]{3}([^.?!:]+)(?:([?!])|[:.])"
   ],
   "scrape_subtitle": [
-      "^[.][.][.][^:.]*[.:] ([^.0-9][^:]*): ",
-      "^[0-9]+/[0-9]+[.] +([^:]*): ",
-      "^([^.0-9][^:]+): "
+    {
+      "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)",
+      "filter": 1
+    },
+    {
+      "pattern": "^[S 0-9]+,?[ /][Eep 0-9]+[.]? +(.*)",
+      "filter": 1
+    },
+    {
+      "pattern": "^New[.:] +(.*)",
+      "filter": 1
+    },
+    "^([^.?!:]+[!?]?): ",
+    "^([A-Z][^.?!]{4,70}[.?!]) ",
+    "()"
   ],
-  "is_new" : [
-      "^(New: )"
+  "scrape_summary": [
+    {
+      "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)",
+      "filter": 1
+    },
+    "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[^.?!:]+[!?]?: (.*)",
+    "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[A-Z][^.?!]{4,70}[.?!] (.*)",
+    "(.*)"
   ],
   "pcre": {
-      "scrape_subtitle": [
-          "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] +)?([^.0-9][^:]*): "
-      ]
+    "scrape_title": [
+      "^(.+)[.]{3}( )[.]{3}(.*?)(?:([?!])|[:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) "
+    ],
+    "scrape_subtitle": [
+      {
+        "pattern": "^[.]{3}.*?(?:[?!:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+        "filter": 1
+      },
+      {
+        "pattern": "^(?:(?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *(.*)",
+        "filter": 1
+      },
+      "^([^.?!:]+[!?]?): ",
+      "^([A-Z].{4,70}?(?:[?!]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.])) ",
+      "()"
+    ],
+    "scrape_summary": [
+      {
+        "pattern": "^[.]{3}.*?(?:[?!:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+        "filter": 1
+      },
+      "^((?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *[^.?!:]+[!?]?: +(.*)",
+      "^((?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *[A-Z].{4,70}?(?:[?!]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+      "(.*)"
+    ]
   }
 }
diff --git a/support/testdata/eitscrape/uk b/support/testdata/eitscrape/uk
index 0b8dd0156..e2a932f88 100644
--- a/support/testdata/eitscrape/uk
+++ b/support/testdata/eitscrape/uk
@@ -11,14 +11,53 @@
 
 "tests" : [
     {
+        "title": "Title Start...",
         "summary": "...Title Continuation Here: Lorem Ipsum (S2 Ep1)",
         "season": "2", "episode": "1",
-        "new_subtitle": null
+        "new_title": "Title Start Title Continuation Here",
+        "new_subtitle": "",
+        "new_summary": "Lorem Ipsum (S2 Ep1)"
     },
     {
+        "title": "Title Start...",
+        "summary": "...Title Continuation Here. Lorem Ipsum (S2 Ep1)",
+        "season": "2", "episode": "1",
+        "new_title": "Title Start Title Continuation Here",
+        "new_subtitle": "",
+        "new_summary": "Lorem Ipsum (S2 Ep1)"
+    },
+    {
+        "title": "Title Start...",
+        "summary": "...Dr. Solo from U.N.C.L.E. investigates. Lorem Ipsum (S2 Ep1)",
+        "season": "2", "episode": "1",
+        "new_title": "Title Start Dr",
+        "new_title:pcre": "Title Start Dr. Solo from U.N.C.L.E. investigates",
+        "new_subtitle": "",
+        "new_summary": "Solo from U.N.C.L.E. investigates. Lorem Ipsum (S2 Ep1)",
+        "new_summary:pcre": "Lorem Ipsum (S2 Ep1)"
+    },
+    {
+        "title": "Title Start...",
+        "summary": "...Title Continuation Here? Lorem Ipsum (S2 Ep1)",
+        "season": "2", "episode": "1",
+        "new_title": "Title Start Title Continuation Here?",
+        "new_subtitle": "",
+        "new_summary": "Lorem Ipsum (S2 Ep1)"
+    },
+    {
+        "title": "Title Start...",
         "summary": "...TitleContinue. Subtitle Here After Title Continue: Lorem Ipsum. (S1 Ep3)",
         "season": "1", "episode": "3",
-        "new_subtitle": "Subtitle Here After Title Continue"
+        "new_title": "Title Start TitleContinue",
+        "new_subtitle": "Subtitle Here After Title Continue",
+        "new_summary": "Lorem Ipsum. (S1 Ep3)"
+    },
+    {
+        "title": "Title Start...",
+        "summary": "...TitleContinue. Subtitle Here After Title Continue. Lorem Ipsum. (S1 Ep3)",
+        "season": "1", "episode": "3",
+        "new_subtitle": "Subtitle Here After Title Continue.",
+        "new_summary": "Lorem Ipsum. (S1 Ep3)"
     },
     {
         "summary": "Lorem Ipsum. (S1 Ep 21)[S]",
@@ -37,9 +76,11 @@
         "season": "5", "episode": "31"
     },
     {
-        "summary": "5/6. Sub Title Here?: Lorem Ipsum.. [HD] [AD,S]",
+        "summary": "5/6. New: Sub Title Here?: Lorem Ipsum. [HD] [AD,S]",
         "season": null, "episode": "5",
-        "new_subtitle": "Sub Title Here?"
+        "new_subtitle": "Sub Title Here?",
+        "new_summary": "New: Lorem Ipsum. [HD] [AD,S]",
+        "new_summary:pcre": "5/6. New: Lorem Ipsum. [HD] [AD,S]"
     },
     {
         "summary": "Lorem Ipsum. (S8, ep 5) [S,AD]",
@@ -50,6 +91,12 @@
         "season": "8", "episode": "17",
         "new_subtitle" : "Subtitle, More Subtitle - Part 1"
     },
+    {
+        "summary": "1948. Director: Fred Nurke. Lorem Ipsum. (S8 Ep17)",
+        "season": "8", "episode": "17",
+        "new_subtitle" : "",
+        "airdate": "1948"
+    },
     {
         "comment": "No space between episode's period and start of text.",
         "summary": "S18 E4.Lorem Ipsum",