expression concatenated together.
For each EPG episode, the title, description and summary are matched
-in turn against the season_num, episode_num, airdate and is_new regexes.
+in turn against the season_num, episode_num, airdate and is_new regex lists.
- season_num converts the contents of the match result to an integer,
and if successful sets the EPG season number.
the match result is ignored.
Next, a combined title/summary text is made by joining the title, a space,
-and the summary. The combined text is matched against the scrape_title regex.
-On a match, the EPG title is set to the match result.
+and the summary. The combined text is matched against the scrape_title regex
+list. On a match, the EPG title is set to the match result.
-Then the summary is matched against the scrape_subtitle regex. On a match,
+Then the summary is matched against the scrape_subtitle regex list. On a match,
the EPG subtitle is set to the match result.
-Finally, the summary is matched against the scrape_summary regex. On a match,
-the EPG summary is set to the match result.
+Finally, the summary is matched against the scrape_summary regex list. On a
+match, the EPG summary is set to the match result.
+
+Filtering regular expressions
+-----------------------------
+
+Any regular expression in a list can be marked as a filtering regular
+expression. If a regular expression marked as a filter matches the input
+text, the match result is not returned. Instead, the match result replaces
+the text being matched, and matching continues with the next regular
+expression in the list. If a filter regular expression does not match,
+matching moves to the next regular expression in the list as usual, with
+the text unchanged.
+
+To mark a regular expression as a filter, it must be specified as an object
+rather than as a plain string. The object must have a "pattern" component
+containing the regular expression pattern. It may also have an optional
+numeric "filter" component. If "filter" is present and not 0, the regular
+expression is a filter.
+
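+For example, in the following list, the first regex is a filter that
+removes any first sentence starting "...", and the second regex is a filter
+that removes a leading "N/M." prefix such as "1/6.". The following regexes
+see only the text that remains after the matching filters have been applied.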
+
+{
+ "scrape_subtitle": [
+ {
+ "pattern": "^[.][.][.][^:.?!]*[.:?!] +(.*)",
+ "filter": 1
+ },
+ {
+ "pattern": "^[0-9]+/[0-9]+[.] +(.*)",
+ "filter": 1
+ },
+ "^([^:]+): "
+ ]
+}
+
+Given any of the following input texts, the above regex list matches
+'Subtitle here':
+
+...Continued title. 1/6. Subtitle here: rest of summary
+...Continued title. Subtitle here: rest of summary
+1/6. Subtitle here: rest of summary
+Subtitle here: rest of summary
Regular expression engine
-------------------------
#include "eitpatternlist.h"
#include "htsmsg.h"
+#define MAX_TEXT_LEN 2048
+
void eit_pattern_compile_list ( eit_pattern_list_t *list, htsmsg_t *l, int flags )
{
eit_pattern_t *pattern;
htsmsg_field_t *f;
const char *s;
+ int filter;
TAILQ_INIT(list);
if (!l) return;
HTSMSG_FOREACH(f, l) {
s = htsmsg_field_get_str(f);
- if (s == NULL) continue;
+ filter = 0;
+ if (s == NULL) {
+ htsmsg_t *m = htsmsg_field_get_map(f);
+ if (m == NULL) continue;
+ s = htsmsg_get_str(m, "pattern");
+ if (s == NULL) continue;
+ filter = htsmsg_get_bool_or_default(m, "filter", 0);
+ }
pattern = calloc(1, sizeof(eit_pattern_t));
pattern->text = strdup(s);
+ pattern->filter = filter;
if (regex_compile(&pattern->compiled, pattern->text, flags, LS_EPGGRAB)) {
tvhwarn(LS_EPGGRAB, "error compiling pattern \"%s\"", pattern->text);
free(pattern->text);
free(pattern);
} else {
- tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\"", pattern->text);
+ tvhtrace(LS_EPGGRAB, "compiled pattern \"%s\", filter %d", pattern->text, pattern->filter);
TAILQ_INSERT_TAIL(list, pattern, p_links);
}
}
void *eit_pattern_apply_list(char *buf, size_t size_buf, const char *text, eit_pattern_list_t *l)
{
eit_pattern_t *p;
- char matchbuf[2048];
+ char textbuf[MAX_TEXT_LEN];
+ char matchbuf[MAX_TEXT_LEN];
int matchno;
assert(buf);
strncat(buf, matchbuf, size_buf - len - 1);
}
rtrim(buf);
- tvhtrace(LS_EPGGRAB," pattern \"%s\" matches with '%s'", p->text, buf);
+ tvhtrace(LS_EPGGRAB," pattern \"%s\" matches '%s' from '%s'", p->text, buf, text);
+ if (p->filter) {
+ strncpy(textbuf, buf, MAX_TEXT_LEN - 1);
+ textbuf[MAX_TEXT_LEN - 1] = '\0';
+ text = textbuf;
+ continue;
+ }
return buf;
}
return NULL;
class Regex(object):
def __init__(self, engine, regex):
self.engine = engine
- self.regex = regex
+ if isinstance(regex, dict):
+ self.regex = regex["pattern"]
+ self.re_is_filter = (regex["filter"] != 0)
+ else:
+ self.regex = regex
+ self.re_is_filter = False
flags = re_base_flag
if not engine:
flags |= re_posix_flag
- self.regcomp = re.compile(regex, flags)
+ self.regcomp = re.compile(self.regex, flags)
def search(self, text):
match = self.regcomp.search(text)
for regex in regexes:
result = regex.search(text)
if result is not None:
+ if regex.re_is_filter:
+ text = result
+ continue
if result == expect:
print 'OK: Got correct result of "{result}" testing "{testing}" for "{text}" using "{pattern}"'.format(result=result, testing=testing, text=text, pattern=regex.text())
self.num_ok = self.num_ok + 1