From: Jim Hague Date: Thu, 28 Dec 2017 11:16:50 +0000 (+0000) Subject: eit: update UK scrapers to use new facilities (#4818) X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8b46f62bef420aeab591f1f5094ab461d73914ea;p=thirdparty%2Ftvheadend.git eit: update UK scrapers to use new facilities (#4818) Refresh the UK scrapers to exploit recent mechanism changes. Add title and summary scrapers, alternate PCRE patterns that offer a little more sophistication, and make use of filter patterns to simplify matching a little. Issue: #4818 --- diff --git a/data/conf/epggrab/eit/scrape/README.uk b/data/conf/epggrab/eit/scrape/README.uk new file mode 100644 index 000000000..042caf1a5 --- /dev/null +++ b/data/conf/epggrab/eit/scrape/README.uk @@ -0,0 +1,61 @@ +Scrapers for UK Freeview/Freesat +================================ + +The 'uk' scraper set provides scrapers for use with UK Freeview DVB-T +and Freesat DVB-S EPG. + +The title, subtitle and summary scrapers are designed to work together, so +for best results, all three should be enabled. They try to ensure that when +text is selected for use in, say, the title, it is not selected for the +subtitle or the summary. In other words, the same text should not be repeated +in other fields. + +The title, subtitle and text patterns attempt to populate the title, +subtitle and summary from the EIT title and summary fields. + +1. If a title ends '...' and the summary starts '...', the start of the + summary is treated as a continuation of the title. The first sentence + is extracted and joined to the title with a space. The '...' are removed. + The used sentence is removed from the subtitle and summary. + +2. If the summary starts with a clause ending ':', or a short sentence + of between 4 and 70 characters, that is used as the subtitle. + It may be preceded by an episode number or New flag. + +3. Remaining text is used as the summary. 
+ +POSIX and PCRE +-------------- + +TvHeadend is often compiled with the PCRE regular expression library; +for example, this is the case in the distributed Debian packages. If so, +alternate PCRE regular expressions are provided. These use PCRE facilities, +notably lookbehinds and non-capturing groups, to attempt a little more +sophistication in scraping. + +1. If episode and New flag occur at the start of the input summary data, + the scraped summary will contain both. With POSIX only the last is retained. + +2. A '.' followed by a space only marks the end of a sentence if it + does not occur in a small set of known patterns, e.g. Dr. or Ms., + and if it does not follow a character preceded by another '.'. In + other words, an acronym such as 'S.H.I.E.L.D.' will not terminate + a sentence. + +Tests +----- + +Test data is provided under support/testdata/eitscrape. Tests can be run from the +support directory. Test the PCRE patterns with: + +tvheadend/support$ ./eitscrape_test.py --pcre ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk + +and the POSIX patterns with: + +tvheadend/support$ ./eitscrape_test.py ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk + +NOTE: To test the PCRE patterns, you MUST have the Python 'regex' package +installed. On Debian-derived systems, this is packaged as 'python-regex'. +The default Python 're' package is not sufficiently compatible with PCRE; +specifically, it does not permit variable-length lookbehinds, and so fails +to compile the PCRE patterns. diff --git a/data/conf/epggrab/eit/scrape/uk b/data/conf/epggrab/eit/scrape/uk index d7286ba47..de3191ff0 100644 --- a/data/conf/epggrab/eit/scrape/uk +++ b/data/conf/epggrab/eit/scrape/uk @@ -15,19 +15,67 @@ "^([0-9]+)/[0-9]+[.] 
" ], "airdate": [ - "\\(([0-9][0-9][0-9][0-9])\\)" + "\\(([0-9]{4})\\)", + ", ([0-9]{4})[.]", + "^([0-9]{4})[.]" + ], + "is_new" : [ + "New[.:] " + ], + "scrape_title": [ + "^(.+)[.]{3}( )[.]{3}([^.?!:]+)(?:([?!])|[:.])" ], "scrape_subtitle": [ - "^[.][.][.][^:.]*[.:] ([^.0-9][^:]*): ", - "^[0-9]+/[0-9]+[.] +([^:]*): ", - "^([^.0-9][^:]+): " + { + "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)", + "filter": 1 + }, + { + "pattern": "^[S 0-9]+,?[ /][Eep 0-9]+[.]? +(.*)", + "filter": 1 + }, + { + "pattern": "^New[.:] +(.*)", + "filter": 1 + }, + "^([^.?!:]+[!?]?): ", + "^([A-Z][^.?!]{4,70}[.?!]) ", + "()" ], - "is_new" : [ - "^(New: )" + "scrape_summary": [ + { + "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)", + "filter": 1 + }, + "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[^.?!:]+[!?]?: (.*)", + "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[A-Z][^.?!]{4,70}[.?!] (.*)", + "(.*)" ], "pcre": { - "scrape_subtitle": [ - "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] +)?([^.0-9][^:]*): " - ] + "scrape_title": [ + "^(.+)[.]{3}( )[.]{3}(.*?)(?:([?!])|[:]|(?