eit: add title and summary scrapers (#4801)

author Jim Hague <jim.hague@acm.org>

Sun, 17 Dec 2017 00:48:23 +0000 (00:48 +0000)

committer Jaroslav Kysela <perex@perex.cz>

Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
author Jim Hague <jim.hague@acm.org>
Sun, 17 Dec 2017 00:48:23 +0000 (00:48 +0000)
committer Jaroslav Kysela <perex@perex.cz>
Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
diff --git a/data/conf/epggrab/eit/scrape/README b/data/conf/epggrab/eit/scrape/README

index 51972c4d65b38389322f30154734d23637e3ab6b..41ad82c3d0b16e04b43c189dc39307366e287253 100644 (file)
--- a/data/conf/epggrab/eit/scrape/README
+++ b/data/conf/epggrab/eit/scrape/README
@@ -11,32 +11,40 @@ object are:
  * episode_num
  * airdate
  * is_new
+* scrape_title
  * scrape_subtitle
+* scrape_summary
  
  Each member's value is a list of regular expressions. Each regular
  expression must contain at least one sub-pattern, i.e. a pattern
  enclosed in (). Input data is matched against the first regex in the
  list. If no match is found, the second regex is tried, and so on until
-a match is found or the list exhausted.
+a match is found or the list exhausted. If a match is found, the result
+of the match is the contents of all the sub-patterns in the regular
+expression concatenated together.
  
  For each EPG episode, the title, description and summary are matched
  in turn against the season_num, episode_num, airdate and is_new regexes.
  
-- season_num converts the contents of the first sub-pattern to an integer,
+- season_num converts the contents of the match result to an integer,
    and if successful sets the EPG season number.
-- episode_num converts the contents of the first sub-pattern to an integer,
+- episode_num converts the contents of the match result to an integer,
    and if successful sets the EPG episode number.
-- airdate converts the contents of the first sub-pattern to an integer,
+- airdate converts the contents of the match result to an integer,
    and if successful sets the EPG copyright year.
  - is_new sets the EPG is_new flag on any match. Remember the regexp must
-  have one sub-pattern to make a successful match; in this case the content
-  of the sub-pattern is ignored.
-
-Finally, the summary only is matched against the scrape_subtitle regexs.
-On an match, the EPG subtitle is set to the contents of the first sub-pattern.
-If a second sub-pattern is present in the regex, the EPG summary is set to
-the contents of that sub-pattern. If no second sub-pattern is present, the
-EPG summary is not changed.
+  have at least one sub-pattern to make a successful match; in this case
+  the match result is ignored.
+
+Next, a combined title/summary text is made by joining the title, a space,
+and the summary. The combined text is matched against the scrape_title regex.
+On a match, the EPG title is set to the match result.
+
+Then the summary is matched against the scrape_subtitle regex. On a match,
+the EPG subtitle is set to the match result.
+
+Finally, the summary is matched against the scrape_summary regex. On a match,
+the EPG summary is set to the match result.
  
  Regular expression engine
  -------------------------
diff --git a/src/epggrab.h b/src/epggrab.h

index a7e6a0c5084908e5d77ad5dadc2d16782e8ecd6e..9c76624eaf833f37882a22b94239afb16d1b79a7 100644 (file)
--- a/src/epggrab.h
+++ b/src/epggrab.h
@@ -290,7 +290,9 @@ struct epggrab_module_ota_scraper
    epggrab_module_ota_t             ;      ///< Parent object
    char                   *scrape_config;  ///< Config to use or blank/NULL for default.
    int                     scrape_episode; ///< Scrape season/episode from EIT summary
+  int                     scrape_title;   ///< Scrape title from EIT title + summary
    int                     scrape_subtitle;///< Scrape subtitle from EIT summary
+  int                     scrape_summary; ///< Scrape summary from EIT summary
  };
  
  /*
diff --git a/src/epggrab/module.c b/src/epggrab/module.c

index 5808b0d5613ca1b2375ca8c4c6aacf1a08ace7a9..9e502138099178c2e599370ba37ee59efe4d5b1f 100644 (file)
--- a/src/epggrab/module.c
+++ b/src/epggrab/module.c
@@ -299,6 +299,19 @@ const idclass_t epggrab_mod_ota_scraper_class = {
        .off    = offsetof(epggrab_module_ota_scraper_t, scrape_episode),
        .group  = 2,
      },
+    {
+      .type   = PT_BOOL,
+      .id     = "scrape_title",
+      .name   = N_("Scrape Title"),
+      .desc   = N_("Enable/disable scraping title from the programme title and description. "
+                   "Some broadcasters can split the title over the separate title, "
+                   "and summary fields. This allows scraping of common split title formats "
+                   "from within the broadcast title and summary field if supported by the "
+                   "configuration file."
+                   ),
+      .off    = offsetof(epggrab_module_ota_scraper_t, scrape_title),
+      .group  = 2,
+    },
      {
        .type   = PT_BOOL,
        .id     = "scrape_subtitle",
@@ -312,6 +325,19 @@ const idclass_t epggrab_mod_ota_scraper_class = {
        .off    = offsetof(epggrab_module_ota_scraper_t, scrape_subtitle),
        .group  = 2,
      },
+    {
+      .type   = PT_BOOL,
+      .id     = "scrape_summary",
+      .name   = N_("Scrape Summary"),
+      .desc   = N_("Enable/disable scraping summary from the programme description. "
+                   "Some broadcasters do not send separate title, subtitle, description, "
+                   "and summary fields. This allows scraping of a modified summary "
+                   "from within the broadcast summary field if supported by the "
+                   "configuration file."
+                   ),
+      .off    = offsetof(epggrab_module_ota_scraper_t, scrape_summary),
+      .group  = 2,
+    },
      {}
    }
  };
diff --git a/src/epggrab/module/eit.c b/src/epggrab/module/eit.c

index d343c2f22faf723bca1b08dd9b826f59c1c21f83..88bab322f3755a01e7076ffe6a582f7332a78b86 100644 (file)
--- a/src/epggrab/module/eit.c
+++ b/src/epggrab/module/eit.c
@@ -67,7 +67,9 @@ typedef struct eit_module_t
    eit_pattern_list_t p_snum;
    eit_pattern_list_t p_enum;
    eit_pattern_list_t p_airdate;        ///< Original air date parser
+  eit_pattern_list_t p_scrape_title;   ///< Scrape title from title + summary data
    eit_pattern_list_t p_scrape_subtitle;///< Scrape subtitle from summary data
+  eit_pattern_list_t p_scrape_summary; ///< Scrape summary from summary data
    eit_pattern_list_t p_is_new;         ///< Is programme new to air
  } eit_module_t;
  
@@ -513,41 +515,63 @@ _eit_scrape_episode(lang_str_t *str,
    }
  }
  
-/* Scrape subtitle data from the broadcast data.
- * @param text - string from broadcaster to search for all languages.
+/* Scrape title/subtitle/summary data from the broadcast data.
   * @param eit_mod - our module with regex to use.
   * @param ev - [out] modified event data.
   */
  static void
-_eit_scrape_subtitle(eit_module_t *eit_mod,
-                     eit_event_t *ev)
+_eit_scrape_text(eit_module_t *eit_mod, eit_event_t *ev)
  {
    lang_str_ele_t *se;
-  lang_str_t *ls;
-  char buffer1[2048];
-  char buffer2[2048];
-  char *bufs[2] = { buffer1, buffer2 };
-  size_t sizes[2] = { sizeof(buffer1), sizeof(buffer2) };
-
-  /* Freeview/Freesat have a subtitle as part of the summary in the format
-   * "subtitle: desc". So try and extract it and use that.
-   * If we can't find a subtitle then default to previous behaviour of
-   * setting the summary as the subtitle.
+  char buffer[2048];
+
+  /* UK Freeview/Freesat have a subtitle as part of the summary in the format
+   * "subtitle: desc". They may also have the title continue into the
+   * summary. So if configured, run scrapers for the title, the subtitle
+   * and the summary (the latter to tidy up).
     */
-  ls = lang_str_create();
-  RB_FOREACH(se, ev->summary, link) {
-    if (eit_pattern_apply_list_2(bufs, sizes, se->str, &eit_mod->p_scrape_subtitle)) {
-      tvhtrace(LS_TBL_EIT, "  scrape subtitle '%s'/'%s' from '%s' using %s",
-               buffer1, buffer2, se->str, eit_mod->id);
-      lang_str_set(&ev->subtitle, buffer1, se->lang);
-      if (bufs[1])
-        lang_str_set(&ls, buffer2, se->lang);
+  if (ev->title && ev->summary && eit_mod->scrape_title) {
+    char title_summary[2048];
+    lang_str_t *ls = lang_str_create();
+    RB_FOREACH(se, ev->title, link) {
+      snprintf(title_summary, sizeof(title_summary), "%s %s",
+               se->str, lang_str_get(ev->summary, se->lang));
+      if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, &eit_mod->p_scrape_title)) {
+        tvhtrace(LS_TBL_EIT, "  scrape title '%s' from '%s' using %s",
+                 buffer, title_summary, eit_mod->id);
+        lang_str_set(&ls, buffer, se->lang);
+      }
+    }
+    RB_FOREACH(se, ls, link) {
+      lang_str_set(&ev->title, se->str, se->lang);
+    }
+    lang_str_destroy(ls);
+  }
+
+  if (ev->summary && eit_mod->scrape_subtitle) {
+    RB_FOREACH(se, ev->summary, link) {
+      if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_subtitle)) {
+        tvhtrace(LS_TBL_EIT, "  scrape subtitle '%s' from '%s' using %s",
+                 buffer, se->str, eit_mod->id);
+        lang_str_set(&ev->subtitle, buffer, se->lang);
+      }
      }
    }
-  RB_FOREACH(se, ls, link) {
+
+  if (ev->summary && eit_mod->scrape_summary) {
+    lang_str_t *ls = lang_str_create();
+    RB_FOREACH(se, ev->summary, link) {
+      if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_summary)) {
+        tvhtrace(LS_TBL_EIT, "  scrape summary '%s' from '%s' using %s",
+                 buffer, se->str, eit_mod->id);
+        lang_str_set(&ls, buffer, se->lang);
+      }
+    }
+    RB_FOREACH(se, ls, link) {
        lang_str_set(&ev->summary, se->str, se->lang);
+    }
+    lang_str_destroy(ls);
    }
-  lang_str_destroy(ls);
  }
  
  /* ************************************************************************
@@ -802,8 +826,7 @@ static int _eit_process_event
        _eit_scrape_episode(ev.summary, eit_mod, &ev);
    }
  
-  if (ev.summary && eit_mod->scrape_subtitle)
-    _eit_scrape_subtitle(eit_mod, &ev);
+  _eit_scrape_text(eit_mod, &ev);
  
    if (lock)
      pthread_mutex_lock(&global_lock);
@@ -1186,7 +1209,9 @@ static void _eit_scrape_clear(eit_module_t *mod)
    eit_pattern_free_list(&mod->p_snum);
    eit_pattern_free_list(&mod->p_enum);
    eit_pattern_free_list(&mod->p_airdate);
+  eit_pattern_free_list(&mod->p_scrape_title);
    eit_pattern_free_list(&mod->p_scrape_subtitle);
+  eit_pattern_free_list(&mod->p_scrape_summary);
    eit_pattern_free_list(&mod->p_is_new);
  }
  
@@ -1199,10 +1224,18 @@ static int _eit_scrape_load_one ( htsmsg_t *m, eit_module_t* mod )
      eit_pattern_compile_named_list(&mod->p_is_new, m, "is_new");
    }
  
+  if (mod->scrape_title) {
+    eit_pattern_compile_named_list(&mod->p_scrape_title, m, "scrape_title");
+  }
+
    if (mod->scrape_subtitle) {
      eit_pattern_compile_named_list(&mod->p_scrape_subtitle, m, "scrape_subtitle");
    }
  
+  if (mod->scrape_summary) {
+    eit_pattern_compile_named_list(&mod->p_scrape_summary, m, "scrape_summary");
+  }
+
    return 1;
  }
  
diff --git a/src/epggrab/module/eitpatternlist.c b/src/epggrab/module/eitpatternlist.c

index e066c1bb675a9f5c0670c3fc64776df079d18ffb..46207ef2d2ee5c4a9b94fb87fd10059d43749373 100644 (file)
--- a/src/epggrab/module/eitpatternlist.c
+++ b/src/epggrab/module/eitpatternlist.c
@@ -63,13 +63,6 @@ void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, con
    eit_pattern_compile_list(list, htsmsg_get_list(m, key), TVHREGEX_POSIX);
  }
  
-void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
-{
-  char *b[2] = { buf, NULL };
-  size_t s[2] = { size_buf, 0 };
-  return eit_pattern_apply_list_2(b, s, text, l);
-}
-
  static void rtrim(char *buf)
  {
    size_t len = strlen(buf);
@@ -78,27 +71,29 @@ static void rtrim(char *buf)
    buf[len] = '\0';
  }
  
-void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l)
+void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
  {
    eit_pattern_t *p;
+  char matchbuf[2048];
  
-  assert(buf[0]);
+  assert(buf);
    assert(text);
  
    if (!l) return NULL;
-  /* search and report the first match */
+
+  /* search and concatenate all subgroup matches - there must be at least one */
    TAILQ_FOREACH(p, l, p_links)
      if (!regex_match(&p->compiled, text) &&
-        !regex_match_substring(&p->compiled, 1, buf[0], size_buf[0])) {
-      rtrim(buf[0]);
-      if (buf[1] && !regex_match_substring(&p->compiled, 2, buf[1], size_buf[1])) {
-        rtrim(buf[1]);
-        tvhtrace(LS_EPGGRAB,"  pattern \"%s\" matches with '%s' & '%s'", p->text, buf[0], buf[1]);
-      } else {
-        buf[1] = NULL;
-        tvhtrace(LS_EPGGRAB,"  pattern \"%s\" matches with '%s'", p->text, buf[0]);
+        !regex_match_substring(&p->compiled, 1, buf, size_buf)) {
+      for (int matchno = 2; ; ++matchno) {
+        if (regex_match_substring(&p->compiled, matchno, matchbuf, sizeof(matchbuf)))
+          break;
+        size_t len = strlen(buf);
+        strncat(buf, matchbuf, size_buf - len - 1);
        }
-      return buf[0];
+      rtrim(buf);
+      tvhtrace(LS_EPGGRAB,"  pattern \"%s\" matches with '%s'", p->text, buf);
+      return buf;
      }
    return NULL;
  }
diff --git a/src/epggrab/module/eitpatternlist.h b/src/epggrab/module/eitpatternlist.h

index a32d7b59addf9dae8820d85d10080e23c68923c1..dc6a02416df60bad37813f17267756fde59e1c39 100644 (file)
--- a/src/epggrab/module/eitpatternlist.h
+++ b/src/epggrab/module/eitpatternlist.h
@@ -45,11 +45,5 @@ void eit_pattern_compile_named_list ( eit_pattern_list_t *list, htsmsg_t *m, con
   * Return the buf or NULL if no match.
   */
  void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l);
-/* As eit_pattern_apply_list(), but return up to 2 matches.
- * buf[0] & size_buf[0] are the first match, buf[1] & size_buf[1] the second.
- * If no second match is found, set buf[1] to NULL.
- * Return the first buf or NULL if no match.
- */
-void *eit_pattern_apply_list_2(char *buf[2], size_t size_buf[2], const char *text, eit_pattern_list_t *l);
  void eit_pattern_free_list ( eit_pattern_list_t *l );
  #endif
author	Jim Hague <jim.hague@acm.org>
	Sun, 17 Dec 2017 00:48:23 +0000 (00:48 +0000)
committer	Jaroslav Kysela <perex@perex.cz>
	Thu, 21 Dec 2017 17:01:20 +0000 (18:01 +0100)
data/conf/epggrab/eit/scrape/README		patch \| blob \| blame \| history
src/epggrab.h		patch \| blob \| blame \| history
src/epggrab/module.c		patch \| blob \| blame \| history
src/epggrab/module/eit.c		patch \| blob \| blame \| history
src/epggrab/module/eitpatternlist.c		patch \| blob \| blame \| history
src/epggrab/module/eitpatternlist.h		patch \| blob \| blame \| history