From: Jim Hague Date: Sun, 17 Dec 2017 00:48:23 +0000 (+0000) Subject: eit: add title and summary scrapers (#4801) X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=07849161f5e955abfaf8eba893a086cf150abe9e;p=thirdparty%2Ftvheadend.git eit: add title and summary scrapers (#4801) Since this change adds a summary scraper, remove the recently added summary update from the second match subgroup and instead build the match from each scraper by concatenating all matching subgroups. This lets us pick multiple items from the input. Issue: #4801 --- diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README index 51972c4d6..41ad82c3d 100644 --- a/data/conf/epggrab/eit/scrape/README +++ b/data/conf/epggrab/eit/scrape/README @@ -11,32 +11,40 @@ object are: * episode_num * airdate * is_new +* scrape_title * scrape_subtitle +* scrape_summary Each member's value is a list of regular expressions. Each regular expression must contain at least one sub-pattern, i.e. a pattern enclosed in (). Input data is matched against the first regex in the list. If no match is found, the second regex is tried, and so on until -a match is found or the list exhausted. +a match is found or the list exhausted. If a match is found, the result +of the match is the contents of all the sub-patterns in the regular +expression concatenated together. For each EPG episode, the title, description and summary are matched in turn against the season_num, episode_num, airdate and is_new regexes. -- season_num converts the contents of the first sub-pattern to an integer, +- season_num converts the contents of the match result to an integer, and if successful sets the EPG season number. -- episode_num converts the contents of the first sub-pattern to an integer, +- episode_num converts the contents of the match result to an integer, and if successful sets the EPG eipsode number. -- airdate converts the contents of the first sub-pattern to an integer, +- airdate converts the contents of the match result to an integer, and if successful sets the EPG copyright year. - is_new sets the EPG is_new flag on any match. Remember the regexp must - have one sub-pattern to make a successful match; in this case the content - of the sub-pattern is ignored. - -Finally, the summary only is matched against the scrape_subtitle regexs. -On an match, the EPG subtitle is set to the contents of the first sub-pattern. -If a second sub-pattern is present in the regex, the EPG summary is set to -the contents of that sub-pattern. If no second sub-pattern is present, the -EPG summary is not changed. + have at least one sub-pattern to make a successful match; in this case + the match result is ignored. + +Next, a combined title/summary text is made by joining the title, a space, +and the summary. The combined text is matched against the scrape_title regex. +On a match, the EPG title is set to the match result. + +Then the summary is matched against the scrape_subtitle regex. On a match, +the EPG subtitle is set to the match result. + +Finally, the summary is matched against the scrape_summary regex. On a match, +the EPG summary is set to the match result. Regular expression engine ------------------------- diff --git a/src/epggrab.h b/src/epggrab.h index a7e6a0c50..9c76624ea 100644 --- a/src/epggrab.h +++ b/src/epggrab.h @@ -290,7 +290,9 @@ struct epggrab_module_ota_scraper epggrab_module_ota_t ; ///< Parent object char *scrape_config; ///< Config to use or blank/NULL for default. int scrape_episode; ///< Scrape season/episode from EIT summary + int scrape_title; ///< Scrape title from EIT title + summary int scrape_subtitle;///< Scrape subtitle from EIT summary + int scrape_summary; ///< Scrape summary from EIT summary }; /* diff --git a/src/epggrab/module.c b/src/epggrab/module.c index 5808b0d56..9e5021380 100644 --- a/src/epggrab/module.c +++ b/src/epggrab/module.c @@ -299,6 +299,19 @@ const idclass_t epggrab_mod_ota_scraper_class = { .off = offsetof(epggrab_module_ota_scraper_t, scrape_episode), .group = 2, }, + { + .type = PT_BOOL, + .id = "scrape_title", + .name = N_("Scrape Title"), + .desc = N_("Enable/disable scraping title from the programme title and description. " + "Some broadcasters can split the title over the separate title, " + "and summary fields. This allows scraping of common split title formats " + "from within the broadcast title and summary field if supported by the " + "configuration file." + ), + .off = offsetof(epggrab_module_ota_scraper_t, scrape_title), + .group = 2, + }, { .type = PT_BOOL, .id = "scrape_subtitle", @@ -312,6 +325,19 @@ const idclass_t epggrab_mod_ota_scraper_class = { .off = offsetof(epggrab_module_ota_scraper_t, scrape_subtitle), .group = 2, }, + { + .type = PT_BOOL, + .id = "scrape_summary", + .name = N_("Scrape Summary"), + .desc = N_("Enable/disable scraping summary from the programme description. " + "Some broadcasters do not send separate title, subtitle, description, " + "and summary fields. This allows scraping of a modified summary " + "from within the broadcast summary field if supported by the " + "configuration file." + ), + .off = offsetof(epggrab_module_ota_scraper_t, scrape_summary), + .group = 2, + }, {} } }; diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c index d343c2f22..88bab322f 100644 --- a/src/epggrab/module/eit.c +++ b/src/epggrab/module/eit.c @@ -67,7 +67,9 @@ typedef struct eit_module_t eit_pattern_list_t p_snum; eit_pattern_list_t p_enum; eit_pattern_list_t p_airdate; ///< Original air date parser + eit_pattern_list_t p_scrape_title; ///< Scrape title from title + summary data eit_pattern_list_t p_scrape_subtitle;///< Scrape subtitle from summary data + eit_pattern_list_t p_scrape_summary; ///< Scrape summary from summary data eit_pattern_list_t p_is_new; ///< Is programme new to air } eit_module_t; @@ -513,41 +515,63 @@ _eit_scrape_episode(lang_str_t *str, } } -/* Scrape subtitle data from the broadcast data. - * @param text - string from broadcaster to search for all languages. +/* Scrape title/subtitle/summary data from the broadcast data. * @param eit_mod - our module with regex to use. * @param ev - [out] modified event data. */ static void -_eit_scrape_subtitle(eit_module_t *eit_mod, - eit_event_t *ev) +_eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev) { lang_str_ele_t *se; - lang_str_t *ls; - char buffer1[2048]; - char buffer2[2048]; - char *bufs[2] = { buffer1, buffer2 }; - size_t sizes[2] = { sizeof(buffer1), sizeof(buffer2) }; - - /* Freeview/Freesat have a subtitle as part of the summary in the format - * "subtitle: desc". So try and extract it and use that. - * If we can't find a subtitle then default to previous behaviour of - * setting the summary as the subtitle. + char buffer[2048]; + + /* UK Freeview/Freesat have a subtitle as part of the summary in the format + * "subtitle: desc". They may also have the title continue into the + * summary. So if configured, run scrapers for the title, the subtitle + * and the summary (the latter to tidy up). */ - ls = lang_str_create(); - RB_FOREACH(se, ev->summary, link) { - if (eit_pattern_apply_list_2(bufs, sizes, se->str, &eit_mod->p_scrape_subtitle)) { - tvhtrace(LS_TBL_EIT, " scrape subtitle '%s'/'%s' from '%s' using %s", - buffer1, buffer2, se->str, eit_mod->id); - lang_str_set(&ev->subtitle, buffer1, se->lang); - if (bufs[1]) - lang_str_set(&ls, buffer2, se->lang); + if (ev->title && ev->summary && eit_mod->scrape_title) { + char title_summary[2048]; + lang_str_t *ls = lang_str_create(); + RB_FOREACH(se, ev->title, link) { + snprintf(title_summary, sizeof(title_summary), "%s %s", + se->str, lang_str_get(ev->summary, se->lang)); + if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, &eit_mod->p_scrape_title)) { + tvhtrace(LS_TBL_EIT, " scrape title '%s' from '%s' using %s", + buffer, title_summary, eit_mod->id); + lang_str_set(&ls, buffer, se->lang); + } + } + RB_FOREACH(se, ls, link) { + lang_str_set(&ev->title, se->str, se->lang); + } + lang_str_destroy(ls); + } + + if (ev->summary && eit_mod->scrape_subtitle) { + RB_FOREACH(se, ev->summary, link) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_subtitle)) { + tvhtrace(LS_TBL_EIT, " scrape subtitle '%s' from '%s' using %s", + buffer, se->str, eit_mod->id); + lang_str_set(&ev->subtitle, buffer, se->lang); + } } } - RB_FOREACH(se, ls, link) { + + if (ev->summary && eit_mod->scrape_summary) { + lang_str_t *ls = lang_str_create(); + RB_FOREACH(se, ev->summary, link) { + if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_summary)) { + tvhtrace(LS_TBL_EIT, " scrape summary '%s' from '%s' using %s", + buffer, se->str, eit_mod->id); + lang_str_set(&ls, buffer, se->lang); + } + } + RB_FOREACH(se, ls, link) { lang_str_set(&ev->summary, se->str, se->lang); + } + lang_str_destroy(ls); } - lang_str_destroy(ls); } /* ************************************************************************ @@ -802,8 +826,7 @@ static int _eit_process_event _eit_scrape_episode(ev.summary, eit_mod, &ev); } - if (ev.summary && eit_mod->scrape_subtitle) - _eit_scrape_subtitle(eit_mod, &ev); + _eit_scrape_text(eit_mod, &ev); if (lock) pthread_mutex_lock(&global_lock); @@ -1186,7 +1209,9 @@ static void _eit_scrape_clear(eit_module_t *mod) eit_pattern_free_list(&mod->p_snum); eit_pattern_free_list(&mod->p_enum); eit_pattern_free_list(&mod->p_airdate); + eit_pattern_free_list(&mod->p_scrape_title); eit_pattern_free_list(&mod->p_scrape_subtitle); + eit_pattern_free_list(&mod->p_scrape_summary); eit_pattern_free_list(&mod->p_is_new); } @@ -1199,10 +1224,18 @@ static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod ) eit_pattern_compile_named_list(&mod->p_is_new, m, "is_new"); } + if (mod->scrape_title) { + eit_pattern_compile_named_list(&mod->p_scrape_title, m, "scrape_title"); + } + if (mod->scrape_subtitle) { eit_pattern_compile_named_list(&mod->p_scrape_subtitle, m, "scrape_subtitle"); } + if (mod->scrape_summary) { + eit_pattern_compile_named_list(&mod->p_scrape_summary, m, "scrape_summary"); + } + return 1; } diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c index e066c1bb6..46207ef2d 100644 --- a/src/epggrab/module/eitpatternlist.c +++ b/src/epggrab/module/eitpatternlist.c @@ -63,13 +63,6 @@ void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, con eit_pattern_compile_list(list, htsmsg_get_list(m, key), TVHREGEX_POSIX); } -void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l) -{ - char *b[2] = { buf, NULL }; - size_t s[2] = { size_buf, 0 }; - return eit_pattern_apply_list_2(b, s, text, l); -} - static void rtrim(char *buf) { size_t len = strlen(buf); @@ -78,27 +71,29 @@ static void rtrim(char *buf) buf[len] = '\0'; } -void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l) +void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l) { eit_pattern_t *p; + char matchbuf[2048]; - assert(buf[0]); + assert(buf); assert(text); if (!l) return NULL; - /* search and report the first match */ + + /* search and concatenate all subgroup matches - there must be at least one */ TAILQ_FOREACH(p, l, p_links) if (!regex_match(&p->compiled, text) && - !regex_match_substring(&p->compiled, 1, buf[0], size_buf[0])) { - rtrim(buf[0]); - if (buf[1] && !regex_match_substring(&p->compiled, 2, buf[1], size_buf[1])) { - rtrim(buf[1]); - tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s' & '%s'", p->text, buf[0], buf[1]); - } else { - buf[1] = NULL; - tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf[0]); + !regex_match_substring(&p->compiled, 1, buf, size_buf)) { + for (int matchno = 2; ; ++matchno) { + if (regex_match_substring(&p->compiled, matchno, matchbuf, sizeof(matchbuf))) + break; + size_t len = strlen(buf); + strncat(buf, matchbuf, size_buf - len - 1); } - return buf[0]; + rtrim(buf); + tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf); + return buf; } return NULL; } diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h index a32d7b59a..dc6a02416 100644 --- a/src/epggrab/module/eitpatternlist.h +++ b/src/epggrab/module/eitpatternlist.h @@ -45,11 +45,5 @@ void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, con * Return the buf or NULL if no match. */ void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l); -/* As eit_pattern_apply_list(), but return up to 2 matches. - * buf[0] & size_buf[0] are the first match, buf[1] & size_buf[1] the second. - * If no second match is found, set buf[1] to NULL. - * Return the first buf or NULL if no match. - */ -void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l); void eit_pattern_free_list ( eit_pattern_list_t *l ); #endif