regular expressions is at
http://www.regular-expressions.info/refbasic.html.
+Languages
+---------
+
+By default, regular expressions are applied to input text in any language.
+You can specify that a regular expression is appropriate only for a
+particular language or group of languages by using an expanded definition
+with a "lang" component. This component may be either a string with a single
+language identifier, or a list of language identifier strings. For
+example:
+
+{
+ "scrape_subtitle": [
+ {
+ "pattern": "^[.][.][.][^:.?!]*[.:?!] +(.*)",
+ "lang": "eng"
+ },
+ {
+ "pattern": "^[0-9]+/[0-9]+[.] +(.*)",
+ "lang": ["eng", "fre"]
+ },
+ "^([^:]+): "
+ ]
+}
+
+If a regular expression is marked with a language or group of languages
+and the language of the input text does not match any of them, the
+regular expression is ignored and processing continues with the next
+regular expression in the list. If the language of the input text is not
+known, the regular expression is applied regardless of its "lang" setting.
+
+Language codes must be 3-character ISO 639-2/B (bibliographic) codes, as
+listed in src/lang_codes.c.
+
Testing
-------
/* search for season number */
RB_FOREACH(se, str, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_snum))
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_snum))
if ((ev->en.s_num = positive_atoi(buffer))) {
tvhtrace(LS_TBL_EIT," extract season number %d using %s", ev->en.s_num, eit_mod->id);
break;
/* ...for episode number */
RB_FOREACH(se, str, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_enum))
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_enum))
if ((ev->en.e_num = positive_atoi(buffer))) {
tvhtrace(LS_TBL_EIT," extract episode number %d using %s", ev->en.e_num, eit_mod->id);
break;
/* Extract original air date year */
RB_FOREACH(se, str, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_airdate)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_airdate)) {
if (strlen(buffer) == 4) {
/* Year component only, so assume it is the copyright year. */
ev->copyright_year = positive_atoi(buffer);
/* Extract is_new flag. Any match is assumed to mean "new" */
RB_FOREACH(se, str, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_is_new)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_is_new)) {
ev->is_new = 1;
break;
}
RB_FOREACH(se, ev->title, link) {
snprintf(title_summary, sizeof(title_summary), "%s %s",
se->str, lang_str_get(ev->summary, se->lang));
- if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, &eit_mod->p_scrape_title)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), title_summary, se->lang, &eit_mod->p_scrape_title)) {
tvhtrace(LS_TBL_EIT, " scrape title '%s' from '%s' using %s",
buffer, title_summary, eit_mod->id);
lang_str_set(&ls, buffer, se->lang);
if (eit_mod->scrape_subtitle) {
RB_FOREACH(se, ev->summary, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_subtitle)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_scrape_subtitle)) {
tvhtrace(LS_TBL_EIT, " scrape subtitle '%s' from '%s' using %s",
buffer, se->str, eit_mod->id);
lang_str_set(&ev->subtitle, buffer, se->lang);
if (eit_mod->scrape_summary) {
lang_str_t *ls = lang_str_create();
RB_FOREACH(se, ev->summary, link) {
- if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, &eit_mod->p_scrape_summary)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), se->str, se->lang, &eit_mod->p_scrape_summary)) {
tvhtrace(LS_TBL_EIT, " scrape summary '%s' from '%s' using %s",
buffer, se->str, eit_mod->id);
lang_str_set(&ls, buffer, se->lang);
#define MAX_TEXT_LEN 2048
+/*
+ * Build the language filter string for a pattern from its "lang" field.
+ *
+ * A string field is copied as-is. A list field is flattened into one
+ * string in which every string item is followed by a '|' separator
+ * (e.g. "eng|fre|"); non-string items are skipped and the result is
+ * silently truncated to MAX_TEXT_LEN - 1 characters.
+ *
+ * Returns a heap-allocated string that the caller must free(), or NULL
+ * if field is NULL, is neither a string nor a list, or is a list that
+ * yields no text.
+ */
+static char *get_languages_string(htsmsg_field_t *field)
+{
+  const char *code;
+  htsmsg_t *codes;
+  htsmsg_field_t *entry;
+  char joined[MAX_TEXT_LEN];
+
+  if (field == NULL)
+    return NULL;
+
+  /* Scalar form: "lang": "eng" */
+  code = htsmsg_field_get_str(field);
+  if (code)
+    return strdup(code);
+
+  /* List form: "lang": ["eng", "fre"] */
+  codes = htsmsg_field_get_list(field);
+  if (!codes)
+    return NULL;
+
+  joined[0] = '\0';
+  HTSMSG_FOREACH(entry, codes) {
+    code = htsmsg_field_get_str(entry);
+    if (!code)
+      continue;
+    /* Each append is bounded by the space left, keeping room for the NUL. */
+    strncat(joined, code, sizeof(joined) - strlen(joined) - 1);
+    strncat(joined, "|", sizeof(joined) - strlen(joined) - 1);
+  }
+
+  return joined[0] != '\0' ? strdup(joined) : NULL;
+}
+
void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags )
{
eit_pattern_t *pattern;
htsmsg_field_t *f;
- const char *s;
+ const char *text;
int filter;
+ char *langs;
TAILQ_INIT(list);
if (!l) return;
HTSMSG_FOREACH(f, l) {
- s = htsmsg_field_get_str(f);
+ text = htsmsg_field_get_str(f);
filter = 0;
- if (s == NULL) {
+ langs = NULL;
+ if (text == NULL) {
htsmsg_t *m = htsmsg_field_get_map(f);
if (m == NULL) continue;
- s = htsmsg_get_str(m, "pattern");
- if (s == NULL) continue;
+ text = htsmsg_get_str(m, "pattern");
+ if (text == NULL) continue;
filter = htsmsg_get_bool_or_default(m, "filter", 0);
+ langs = get_languages_string(htsmsg_field_find(m, "lang"));
}
pattern = calloc(1, sizeof(eit_pattern_t));
- pattern->text = strdup(s);
+ pattern->text = strdup(text);
pattern->filter = filter;
+ pattern->langs = langs;
if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) {
tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text);
+ free(pattern->langs);
free(pattern->text);
free(pattern);
} else {
buf[len] = '\0';
}
-void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
+void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, const char *lang, eit_pattern_list_t *l)
{
eit_pattern_t *p;
char textbuf[MAX_TEXT_LEN];
if (!l) return NULL;
/* search and concatenate all subgroup matches - there must be at least one */
- TAILQ_FOREACH(p, l, p_links)
+ TAILQ_FOREACH(p, l, p_links) {
+ if (p->langs && lang) {
+ if (strstr(p->langs, lang) == NULL) {
+ continue;
+ }
+ }
if (!regex_match(&p->compiled, text) &&
!regex_match_substring(&p->compiled, 1, buf, size_buf)) {
for (matchno = 2; ; ++matchno) {
}
return buf;
}
+ }
return NULL;
}
if (!l) return;
while ((p = TAILQ_FIRST(l)) != NULL) {
TAILQ_REMOVE(l, p, p_links);
+ free(p->langs);
free(p->text);
regex_free(&p->compiled);
free(p);
char *text;
tvh_regex_t compiled;
int filter;
+ char *langs;
TAILQ_ENTRY(eit_pattern) p_links;
} eit_pattern_t;
* match in buf which is of size size_buf.
* Return the buf or NULL if no match.
*/
-void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l);
+void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, const char *lang, eit_pattern_list_t *l);
void eit_pattern_free_list ( eit_pattern_list_t *l );
#endif
tvhdebug(LS_OPENTV, " title '%s'", ev.title);
/* try to cleanup the title */
- if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.title, &mod->p_cleanup_title)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.title, lang, &mod->p_cleanup_title)) {
tvhtrace(LS_OPENTV, " clean title '%s'", buffer);
s = buffer;
} else {
memset(&en, 0, sizeof(en));
/* search for season number */
- if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_snum))
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_snum))
if ((en.s_num = atoi(buffer)))
tvhtrace(LS_OPENTV," extract season number %d", en.s_num);
/* ...for episode number */
- if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_enum))
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_enum))
if ((en.e_num = atoi(buffer)))
tvhtrace(LS_OPENTV," extract episode number %d", en.e_num);
/* ...for part number */
- if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_pnum)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_pnum)) {
if (buffer[0] >= 'a' && buffer[0] <= 'z')
en.p_num = buffer[0] - 'a' + 1;
else
save |= epg_episode_set_epnum(ee, &en, &changes3);
/* ...for subtitle */
- if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, &mod->p_subt)) {
+ if (eit_pattern_apply_list(buffer, sizeof(buffer), ev.summary, lang, &mod->p_subt)) {
tvhtrace(LS_OPENTV, " extract subtitle '%s'", buffer);
ls = lang_str_create2(buffer, lang);
save |= epg_episode_set_subtitle(ee, ls, &changes3);
self.engine = engine
if isinstance(regex, dict):
self.regex = regex["pattern"]
- self.re_is_filter = (regex["filter"] != 0)
+ try:
+ self.re_is_filter = (regex["filter"] != 0)
+ except KeyError:
+ self.re_is_filter = False
+ try:
+ if isinstance(regex["lang"], str):
+ self.lang = [ regex["lang"] ]
+ else:
+ self.lang = regex["lang"]
+ except KeyError:
+ self.lang = None
else:
self.regex = regex
self.re_is_filter = False
+ self.lang = None
flags = re_base_flag
if not engine:
flags |= re_posix_flag
self.num_failed = 0;
self.num_ok = 0;
- def run_test_case_i(self, text, regexes, expect, testing):
+ def run_test_case_i(self, text, lang, regexes, expect, testing):
"""Run a test case for text using the regular expression lists in reg,
expecting the result of a match to be expect while running a test
case for the string testing."""
for regex in regexes:
+ if lang and regex.lang and lang not in regex.lang:
+ continue
result = regex.search(text)
if result is not None:
if regex.re_is_filter:
canonical, _, for_engine = key.partition(':')
if for_engine and for_engine != engine:
continue
- if canonical in ('comment', 'summary', 'title'):
+ if canonical in ('comment', 'summary', 'title', 'language'):
continue
if canonical in ('age', 'genre'):
print 'Test case contains key "{key}" which is not currently tested for "{test}"'.format(key=key, test=test)
text = test['summary']
if canonical == 'new_title':
text = test['title'] + ' ' + text
+ if 'language' in test:
+ lang = test['language']
+ else:
+ lang = None
if regexes[canonical]:
- self.run_test_case_i(text, regexes[canonical], test[key], key)
+ self.run_test_case_i(text, lang, regexes[canonical], test[key], key)
else:
print 'FAIL: no regex defined for key "{key}"'.format(key=canonical)
self.num_failed = self.num_failed + 1
- "summary" - summary field from EIT broadcast that will be scraped.
+- "language" - optional 3-character ISO 639-2/B language code string
+  specifying the language of the title and summary. If omitted, regular
+  expressions are applied regardless of their "lang" setting.
Expected Result: