--- /dev/null
+Scrapers for UK Freeview/Freesat
+================================
+
+The 'uk' scraper set provides scrapers for use with UK Freeview DVB-T
+and Freesat DVB-S EPG.
+
+The title, subtitle and summary scrapers are designed to work together, so
+for best results, all three should be enabled. They try to ensure that when
+text is selected for use in, say, the title, it is not selected for the
+subtitle or the summary. In other words, the same text should not be repeated
+in other fields.
+
+The title, subtitle and summary patterns attempt to populate the title,
+subtitle and summary from the EIT title and summary fields.
+
+1. If a title ends '...' and the summary starts '...', the start of the
+ summary is treated as a continuation of the title. The first sentence
+ is extracted and joined to the title with a space. The '...' are removed.
+ The used sentence is removed from the subtitle and summary.
+
+2. If the summary starts with a clause ending ':', or a short sentence
+ of between 4 and 70 characters, that is used as the subtitle.
+ It may be preceded by an episode number or New flag.
+
+3. Remaining text is used as the summary.
+
+POSIX and PCRE
+--------------
+
+TvHeadend is often compiled with the PCRE regular expression library;
+for example, this is the case in the distributed Debian packages. If so,
+alternate PCRE regular expressions are provided. These use PCRE facilities,
+notably lookbehinds and non-capturing groups, to attempt a little more
+sophistication in scraping.
+
+1. If both an episode number and a New flag occur at the start of the input
+ summary data, the scraped summary will contain both. With POSIX, only the
+ last is retained.
+
+2. A '.' followed by a space only marks the end of a sentence if it
+ does not follow one of a small set of known abbreviations, e.g. Dr. or
+ Ms., and does not follow a character that is itself preceded by
+ another '.'. For example, neither 'Dr.' nor an acronym such as
+ 'S.H.I.E.L.D.' will terminate a sentence.
+
+Tests
+-----
+
+Test data is provided under support/testdata/eitscrape. Tests can be run
+from the support directory. Test the PCRE patterns with:
+
+tvheadend/support$ ./eitscrape_test.py --pcre ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk
+
+and the POSIX patterns with:
+
+tvheadend/support$ ./eitscrape_test.py ../data/conf/epggrab/eit/scrape/uk testdata/eitscrape/uk
+
+NOTE: To test the PCRE patterns, you MUST have the Python 'regex' package
+installed. On Debian-derived systems, this is packaged as 'python-regex'.
+The default Python 're' package is not sufficiently compatible with PCRE;
+specifically, it does not permit variable-length lookbehinds, and so fails
+to compile the PCRE patterns.
"^([0-9]+)/[0-9]+[.] "
],
"airdate": [
- "\\(([0-9][0-9][0-9][0-9])\\)"
+ "\\(([0-9]{4})\\)",
+ ", ([0-9]{4})[.]",
+ "^([0-9]{4})[.]"
+ ],
+ "is_new" : [
+ "New[.:] "
+ ],
+ "scrape_title": [
+ "^(.+)[.]{3}( )[.]{3}([^.?!:]+)(?:([?!])|[:.])"
],
"scrape_subtitle": [
- "^[.][.][.][^:.]*[.:] ([^.0-9][^:]*): ",
- "^[0-9]+/[0-9]+[.] +([^:]*): ",
- "^([^.0-9][^:]+): "
+ {
+ "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)",
+ "filter": 1
+ },
+ {
+ "pattern": "^[S 0-9]+,?[ /][Eep 0-9]+[.]? +(.*)",
+ "filter": 1
+ },
+ {
+ "pattern": "^New[.:] +(.*)",
+ "filter": 1
+ },
+ "^([^.?!:]+[!?]?): ",
+ "^([A-Z][^.?!]{4,70}[.?!]) ",
+ "()"
],
- "is_new" : [
- "^(New: )"
+ "scrape_summary": [
+ {
+ "pattern": "^[.]{3}[^.?!:]+[.?!:] +(.*)",
+ "filter": 1
+ },
+ "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[^.?!:]+[!?]?: (.*)",
+ "^([S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )* *[A-Z][^.?!]{4,70}[.?!] (.*)",
+ "(.*)"
],
"pcre": {
- "scrape_subtitle": [
- "^(?:[.][.][.][^:.]*[.:] +|[0-9]+/[0-9]+[.] +)?([^.0-9][^:]*): "
- ]
+ "scrape_title": [
+ "^(.+)[.]{3}( )[.]{3}(.*?)(?:([?!])|[:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) "
+ ],
+ "scrape_subtitle": [
+ {
+ "pattern": "^[.]{3}.*?(?:[?!:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+ "filter": 1
+ },
+ {
+ "pattern": "^(?:(?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *(.*)",
+ "filter": 1
+ },
+ "^([^.?!:]+[!?]?): ",
+ "^([A-Z].{4,70}?(?:[?!]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.])) ",
+ "()"
+ ],
+ "scrape_summary": [
+ {
+ "pattern": "^[.]{3}.*?(?:[?!:]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+ "filter": 1
+ },
+ "^((?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *[^.?!:]+[!?]?: +(.*)",
+ "^((?:[S 0-9]+,?[ /][Eep 0-9]+[.]? |New[.:] )*) *[A-Z].{4,70}?(?:[?!]|(?<!Dr|Prof|Rev|Mr|Mrs|Ms|[.][^.])[.]) +(.*)",
+ "(.*)"
+ ]
}
}
"tests" : [
{
+ "title": "Title Start...",
"summary": "...Title Continuation Here: Lorem Ipsum (S2 Ep1)",
"season": "2", "episode": "1",
- "new_subtitle": null
+ "new_title": "Title Start Title Continuation Here",
+ "new_subtitle": "",
+ "new_summary": "Lorem Ipsum (S2 Ep1)"
},
{
+ "title": "Title Start...",
+ "summary": "...Title Continuation Here. Lorem Ipsum (S2 Ep1)",
+ "season": "2", "episode": "1",
+ "new_title": "Title Start Title Continuation Here",
+ "new_subtitle": "",
+ "new_summary": "Lorem Ipsum (S2 Ep1)"
+ },
+ {
+ "title": "Title Start...",
+ "summary": "...Dr. Solo from U.N.C.L.E. investigates. Lorem Ipsum (S2 Ep1)",
+ "season": "2", "episode": "1",
+ "new_title": "Title Start Dr",
+ "new_title:pcre": "Title Start Dr. Solo from U.N.C.L.E. investigates",
+ "new_subtitle": "",
+ "new_summary": "Solo from U.N.C.L.E. investigates. Lorem Ipsum (S2 Ep1)",
+ "new_summary:pcre": "Lorem Ipsum (S2 Ep1)"
+ },
+ {
+ "title": "Title Start...",
+ "summary": "...Title Continuation Here? Lorem Ipsum (S2 Ep1)",
+ "season": "2", "episode": "1",
+ "new_title": "Title Start Title Continuation Here?",
+ "new_subtitle": "",
+ "new_summary": "Lorem Ipsum (S2 Ep1)"
+ },
+ {
+ "title": "Title Start...",
"summary": "...TitleContinue. Subtitle Here After Title Continue: Lorem Ipsum. (S1 Ep3)",
"season": "1", "episode": "3",
- "new_subtitle": "Subtitle Here After Title Continue"
+ "new_title": "Title Start TitleContinue",
+ "new_subtitle": "Subtitle Here After Title Continue",
+ "new_summary": "Lorem Ipsum. (S1 Ep3)"
+ },
+ {
+ "title": "Title Start...",
+ "summary": "...TitleContinue. Subtitle Here After Title Continue. Lorem Ipsum. (S1 Ep3)",
+ "season": "1", "episode": "3",
+ "new_subtitle": "Subtitle Here After Title Continue.",
+ "new_summary": "Lorem Ipsum. (S1 Ep3)"
},
{
"summary": "Lorem Ipsum. (S1 Ep 21)[S]",
"season": "5", "episode": "31"
},
{
- "summary": "5/6. Sub Title Here?: Lorem Ipsum.. [HD] [AD,S]",
+ "summary": "5/6. New: Sub Title Here?: Lorem Ipsum. [HD] [AD,S]",
"season": null, "episode": "5",
- "new_subtitle": "Sub Title Here?"
+ "new_subtitle": "Sub Title Here?",
+ "new_summary": "New: Lorem Ipsum. [HD] [AD,S]",
+ "new_summary:pcre": "5/6. New: Lorem Ipsum. [HD] [AD,S]"
},
{
"summary": "Lorem Ipsum. (S8, ep 5) [S,AD]",
"season": "8", "episode": "17",
"new_subtitle" : "Subtitle, More Subtitle - Part 1"
},
+ {
+ "summary": "1948. Director: Fred Nurke. Lorem Ipsum. (S8 Ep17)",
+ "season": "8", "episode": "17",
+ "new_subtitle" : "",
+ "airdate": "1948"
+ },
{
"comment": "No space between episode's period and start of text.",
"summary": "S18 E4.Lorem Ipsum",