* episode_num
* airdate
* is_new
+* scrape_title
* scrape_subtitle
+* scrape_summary
Each member's value is a list of regular expressions. Each regular
expression must contain at least one sub-pattern, i.e. a pattern
enclosed in (). Input data is matched against the first regex in the
list. If no match is found, the second regex is tried, and so on until
-a match is found or the list exhausted.
+a match is found or the list exhausted. If a match is found, the result
+of the match is the contents of all the sub-patterns in the regular
+expression concatenated together.
For each EPG episode, the title, description and summary are matched
in turn against the season_num, episode_num, airdate and is_new regexes.
-- season_num converts the contents of the first sub-pattern to an integer,
+- season_num converts the contents of the match result to an integer,
and if successful sets the EPG season number.
-- episode_num converts the contents of the first sub-pattern to an integer,
+- episode_num converts the contents of the match result to an integer,
and if successful sets the EPG episode number.
-- airdate converts the contents of the first sub-pattern to an integer,
+- airdate converts the contents of the match result to an integer,
and if successful sets the EPG copyright year.
- is_new sets the EPG is_new flag on any match. Remember the regexp must
- have one sub-pattern to make a successful match; in this case the content
- of the sub-pattern is ignored.
-
-Finally, the summary only is matched against the scrape_subtitle regexs.
-On an match, the EPG subtitle is set to the contents of the first sub-pattern.
-If a second sub-pattern is present in the regex, the EPG summary is set to
-the contents of that sub-pattern. If no second sub-pattern is present, the
-EPG summary is not changed.
+ have at least one sub-pattern to make a successful match; in this case
+ the match result is ignored.
+
+Next, a combined title/summary text is made by joining the title, a space,
+and the summary. The combined text is matched against the scrape_title regexes.
+On a match, the EPG title is set to the match result.
+
+Then the summary is matched against the scrape_subtitle regexes. On a match,
+the EPG subtitle is set to the match result.
+
+Finally, the summary is matched against the scrape_summary regexes. On a match,
+the EPG summary is set to the match result.
Regular expression engine
-------------------------
epggrab_module_ota_t ; ///< Parent object
char *scrape_config; ///< Config to use or blank/NULL for default.
int scrape_episode; ///< Scrape season/episode from EIT summary
+ int scrape_title; ///< Scrape title from EIT title + summary
int scrape_subtitle;///< Scrape subtitle from EIT summary
+ int scrape_summary; ///< Scrape summary from EIT summary
};
/*
.off = offsetof(epggrab_module_ota_scraper_t, scrape_episode),
.group = 2,
},
+ {
+ .type = PT_BOOL,
+ .id = "scrape_title",
+ .name = N_("Scrape Title"),
+ .desc = N_("Enable/disable scraping title from the programme title and description. "
+ "Some broadcasters can split the title over the separate title, "
+ "and summary fields. This allows scraping of common split title formats "
+ "from within the broadcast title and summary field if supported by the "
+ "configuration file."
+ ),
+ .off = offsetof(epggrab_module_ota_scraper_t, scrape_title),
+ .group = 2,
+ },
{
.type = PT_BOOL,
.id = "scrape_subtitle",
.off = offsetof(epggrab_module_ota_scraper_t, scrape_subtitle),
.group = 2,
},
+ {
+ .type = PT_BOOL,
+ .id = "scrape_summary",
+ .name = N_("Scrape Summary"),
+ .desc = N_("Enable/disable scraping summary from the programme description. "
+ "Some broadcasters do not send separate title, subtitle, description, "
+ "and summary fields. This allows scraping of a modified summary "
+ "from within the broadcast summary field if supported by the "
+ "configuration file."
+ ),
+ .off = offsetof(epggrab_module_ota_scraper_t, scrape_summary),
+ .group = 2,
+ },
{}
}
};
eit_pattern_list_t p_snum;
eit_pattern_list_t p_enum;
eit_pattern_list_t p_airdate; ///< Original air date parser
+ eit_pattern_list_t p_scrape_title; ///< Scrape title from title + summary data
eit_pattern_list_t p_scrape_subtitle;///< Scrape subtitle from summary data
+ eit_pattern_list_t p_scrape_summary; ///< Scrape summary from summary data
eit_pattern_list_t p_is_new; ///< Is programme new to air
} eit_module_t;
}
}
-/* Scrape subtitle data from the broadcast data.
- * @param text - string from broadcaster to search for all languages.
+/* Scrape title/subtitle/summary data from the broadcast data.
* @param eit_mod - our module with regex to use.
* @param ev - [out] modified event data.
*/
static void
-_eit_scrape_subtitle(eit_module_t *eit_mod,
- eit_event_t *ev)
+_eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev)
{
lang_str_ele_t *se;
- lang_str_t *ls;
- char buffer1[2048];
- char buffer2[2048];
- char *bufs[2] = { buffer1, buffer2 };
- size_t sizes[2] = { sizeof(buffer1), sizeof(buffer2) };
-
- /* Freeview/Freesat have a subtitle as part of the summary in the format
- * "subtitle: desc". So try and extract it and use that.
- * If we can't find a subtitle then default to previous behaviour of
- * setting the summary as the subtitle.
+ char buffer[2048];
+
+ /* UK Freeview/Freesat have a subtitle as part of the summary in the format
+ * "subtitle: desc". They may also have the title continue into the
+ * summary. So if configured, run scrapers for the title, the subtitle
+ * and the summary (the latter to tidy up).
*/
- ls = lang_str_create();
- RB_FOREACH(se, ev->summary, link) {
- if (eit_pattern_apply_list_2(bufs, sizes, se->str, &eit_mod->p_scrape_subtitle)) {
- tvhtrace(LS_TBL_EIT, " scrape subtitle '%s'/'%s' from '%s' using %s",
- buffer1, buffer2, se->str, eit_mod->id);
- lang_str_set(&ev->subtitle, buffer1, se->lang);
- if (bufs[1])
- lang_str_set(&ls, buffer2, se->lang);
+ if (ev->title && ev->summary && eit_mod->scrape_title) {
+ char title_summary[2048];
+ lang_str_t *ls = lang_str_create();
+ RB_FOREACH(se, ev->title, link) {
+ snprintf(title_summary, sizeof(title_summary), "%s %s",
+ se->str, lang_str_get(ev->summary, se->lang));
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, &eit_mod->p_scrape_title)) {
+ tvhtrace(LS_TBL_EIT, " scrape title '%s' from '%s' using %s",
+ buffer, title_summary, eit_mod->id);
+ lang_str_set(&ls, buffer, se->lang);
+ }
+ }
+ RB_FOREACH(se, ls, link) {
+ lang_str_set(&ev->title, se->str, se->lang);
+ }
+ lang_str_destroy(ls);
+ }
+
+ if (ev->summary && eit_mod->scrape_subtitle) {
+ RB_FOREACH(se, ev->summary, link) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_subtitle)) {
+ tvhtrace(LS_TBL_EIT, " scrape subtitle '%s' from '%s' using %s",
+ buffer, se->str, eit_mod->id);
+ lang_str_set(&ev->subtitle, buffer, se->lang);
+ }
}
}
- RB_FOREACH(se, ls, link) {
+
+ if (ev->summary && eit_mod->scrape_summary) {
+ lang_str_t *ls = lang_str_create();
+ RB_FOREACH(se, ev->summary, link) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_summary)) {
+ tvhtrace(LS_TBL_EIT, " scrape summary '%s' from '%s' using %s",
+ buffer, se->str, eit_mod->id);
+ lang_str_set(&ls, buffer, se->lang);
+ }
+ }
+ RB_FOREACH(se, ls, link) {
lang_str_set(&ev->summary, se->str, se->lang);
+ }
+ lang_str_destroy(ls);
}
- lang_str_destroy(ls);
}
/* ************************************************************************
_eit_scrape_episode(ev.summary, eit_mod, &ev);
}
- if (ev.summary && eit_mod->scrape_subtitle)
- _eit_scrape_subtitle(eit_mod, &ev);
+ _eit_scrape_text(eit_mod, &ev);
if (lock)
pthread_mutex_lock(&global_lock);
eit_pattern_free_list(&mod->p_snum);
eit_pattern_free_list(&mod->p_enum);
eit_pattern_free_list(&mod->p_airdate);
+ eit_pattern_free_list(&mod->p_scrape_title);
eit_pattern_free_list(&mod->p_scrape_subtitle);
+ eit_pattern_free_list(&mod->p_scrape_summary);
eit_pattern_free_list(&mod->p_is_new);
}
eit_pattern_compile_named_list(&mod->p_is_new, m, "is_new");
}
+ if (mod->scrape_title) {
+ eit_pattern_compile_named_list(&mod->p_scrape_title, m, "scrape_title");
+ }
+
if (mod->scrape_subtitle) {
eit_pattern_compile_named_list(&mod->p_scrape_subtitle, m, "scrape_subtitle");
}
+ if (mod->scrape_summary) {
+ eit_pattern_compile_named_list(&mod->p_scrape_summary, m, "scrape_summary");
+ }
+
return 1;
}
eit_pattern_compile_list(list, htsmsg_get_list(m, key), TVHREGEX_POSIX);
}
-void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
-{
- char *b[2] = { buf, NULL };
- size_t s[2] = { size_buf, 0 };
- return eit_pattern_apply_list_2(b, s, text, l);
-}
-
static void rtrim(char *buf)
{
size_t len = strlen(buf);
buf[len] = '\0';
}
-void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l)
+void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
{
eit_pattern_t *p;
+ char matchbuf[2048];
- assert(buf[0]);
+ assert(buf);
assert(text);
if (!l) return NULL;
- /* search and report the first match */
+
+ /* search and concatenate all subgroup matches - there must be at least one */
TAILQ_FOREACH(p, l, p_links)
if (!regex_match(&p->compiled, text) &&
- !regex_match_substring(&p->compiled, 1, buf[0], size_buf[0])) {
- rtrim(buf[0]);
- if (buf[1] && !regex_match_substring(&p->compiled, 2, buf[1], size_buf[1])) {
- rtrim(buf[1]);
- tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s' & '%s'", p->text, buf[0], buf[1]);
- } else {
- buf[1] = NULL;
- tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf[0]);
+ !regex_match_substring(&p->compiled, 1, buf, size_buf)) {
+ for (int matchno = 2; ; ++matchno) {
+ if (regex_match_substring(&p->compiled, matchno, matchbuf, sizeof(matchbuf)))
+ break;
+ size_t len = strlen(buf);
+ strncat(buf, matchbuf, size_buf - len - 1);
}
- return buf[0];
+ rtrim(buf);
+ tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf);
+ return buf;
}
return NULL;
}
* Return the buf or NULL if no match.
*/
void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l);
-/* As eit_pattern_apply_list(), but return up to 2 matches.
- * buf[0] & size_buf[0] are the first match, buf[1] & size_buf[1] the second.
- * If no second match is found, set buf[1] to NULL.
- * Return the first buf or NULL if no match.
- */
-void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l);
void eit_pattern_free_list ( eit_pattern_list_t *l );
#endif