From: DeltaMikeCharlie <127641886+DeltaMikeCharlie@users.noreply.github.com> Date: Sat, 18 Oct 2025 23:48:28 +0000 (+1100) Subject: Enhancements to XMLTV Parsing X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=588a49dafa062da591460c0313f3afeae91193b4;p=thirdparty%2Ftvheadend.git Enhancements to XMLTV Parsing --- diff --git a/docs/class/epggrabber_modules.md b/docs/class/epggrabber_modules.md index e4ca12d12..19c5adc94 100644 --- a/docs/class/epggrabber_modules.md +++ b/docs/class/epggrabber_modules.md @@ -64,3 +64,97 @@ means broadcast information such as summary information will still be retrieved. --- + +### XMLTV XPath Examples and Notes + +Although XMLTV is a standard, some providers of XMLTV data include additional information. +XPath-like expressions can be used to extract some of this additional information +for EPG grabbers that use XMLTV as a data source. + +!['EPG Grabber XPath'](static/img/doc/channel/grabber_xpath_fields.png) + +## Category Code + +Some information providers include free form category descriptions +that are not compliant with the DVB EIT standard. + +In the following example, 'Cricket' is not a standard DVB EIT category. +However, '0x40' is the standard code for 'Sport' and the provider has +added this code to allow the standard code to be used when needed. + +``` +<programme> + <category eit="0x40">Cricket</category> +</programme> +``` + +To extract this attribute for use in TVH, we should add `@eit` to the +'Category Code XPath' field. This will extract the hexadecimal code +'0x40' and convert that to the standard category code 'Sport'. + +For the purposes of the category code, the root node is considered to be the +standard `category` node within `programme`. + +## Unique Event Identifier + +By default, XMLTV does not provide a mechanism for uniquely identifying each event. + +In the following example, an XMLTV provider has added the non-standard `uniqueID` +attribute to the `programme` node. 
+``` +<programme uniqueID="1234"> +</programme> +``` +To extract this attribute for use in TVH, we should add `@uniqueID` to the +'Unique Event ID XPath' field. This will assign '1234' as the unique +identifier for this EPG event and will allow future updates matching +this ID to be applied. + +For the purposes of the unique ID, the root node is considered to be `programme`. + +## SeriesLink and EpisodeLink + +A CRID (Content Reference IDentifier) is a mechanism used by broadcasters +to identify events from the same series and multiple occurrences +of the same episode in a series. TVH refers to these as 'SeriesLink' +and 'EpisodeLink'. These fields can be used for recording a whole series +or detecting a repeated episode. + +In the following example, the provider has added the non-standard `crid` node to the XMLTV data. +This has been further broken down to include a `series` node and an `episode` node. + +``` +<programme> + <crid> + <series>crid://provider/abcde</series> + <episode>crid://provider/abcde_98765</episode> + </crid> +</programme> +``` +To extract these values, we should add `//crid/series/text()` and `//crid/episode/text()` +to the 'SeriesLink XPath' and 'EpisodeLink XPath' fields respectively. + +For the purposes of the SeriesLink and EpisodeLink, the root node is +considered to be `programme`. + +## SeriesLink and EpisodeLink Fallbacks + +If the XPath expression does not match any data and these options are enabled, +TVH will perform its standard process for creating 'SeriesLink' and +'EpisodeLink' values; otherwise, the fields will be left empty. 
+ +## Notes + +TVH can only interpret the following subset of XPath identifier syntax: + +/ = Node + +@ = Attribute + +[] = Condition + +text() = Node text + +**Example:** //node1/node2[@attrX=value]/@attrY + +--- \ No newline at end of file diff --git a/src/api/api_epg.c b/src/api/api_epg.c index 35718cc27..d82956d94 100644 --- a/src/api/api_epg.c +++ b/src/api/api_epg.c @@ -99,6 +99,12 @@ api_epg_entry ( epg_broadcast_t *eb, const char *lang, const access_t *perm, con /* EPG IDs */ htsmsg_add_u32(m, "eventId", eb->id); + + if(eb->xmltv_eid) //This is the optional external reference provided by XMLTV. + { + htsmsg_add_str(m, "eventId_xmltv", eb->xmltv_eid); + } + if (eb->episodelink && strncasecmp(eb->episodelink->uri, "tvh://", 6)) htsmsg_add_str(m, "episodeUri", eb->episodelink->uri); if (eb->serieslink) diff --git a/src/epg.c b/src/epg.c index c8f160233..588b8753c 100644 --- a/src/epg.c +++ b/src/epg.c @@ -79,6 +79,22 @@ static int _ebc_start_cmp ( const void *a, const void *b ) return ((epg_broadcast_t*)a)->start - ((epg_broadcast_t*)b)->start; } +static int _ebc_xmltv_cmp ( const void *a, const void *b ) +{ + + //Sometimes, nulls are passed to this function and the strcmp() crashes. 
+ if(!((epg_broadcast_t*)a)->xmltv_eid) + { + return -1; + } + if(!((epg_broadcast_t*)b)->xmltv_eid) + { + return 1; + } + + return strcmp(((epg_broadcast_t*)a)->xmltv_eid, ((epg_broadcast_t*)b)->xmltv_eid); +} + void epg_updated ( void ) { epg_object_t *eo; @@ -579,11 +595,25 @@ static epg_broadcast_t *_epg_channel_add_broadcast /* Find (only) */ if ( !create ) { - return RB_FIND(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_start_cmp); + if((*bcast)->xmltv_eid) + { + return RB_FIND(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_xmltv_cmp); + } + else + { + return RB_FIND(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_start_cmp); + } /* Find/Create */ } else { - ret = RB_INSERT_SORTED(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_start_cmp); + if((*bcast)->xmltv_eid) + { + ret = RB_INSERT_SORTED(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_xmltv_cmp); + } + else + { + ret = RB_INSERT_SORTED(&ch->ch_epg_schedule, *bcast, sched_link, _ebc_start_cmp); + } /* New */ if (!ret) { @@ -697,7 +727,7 @@ static epg_broadcast_t *_epg_channel_add_broadcast if (timer) _epg_channel_timer_callback(ch); if (ret->ops->putref(ret)) return NULL; return ret; -} +}// END _epg_channel_add_broadcast void epg_channel_unlink ( channel_t *ch ) { @@ -936,6 +966,40 @@ static epg_broadcast_t **_epg_broadcast_skel ( void ) return &skel; } +//Prepare an EPG struct to search for an extant event +//using the XMLTV unique ID. 
+epg_broadcast_t *epg_broadcast_find_by_xmltv_eid + ( channel_t *channel, epggrab_module_t *src, + time_t start, time_t stop, int create, + int *save, epg_changes_t *changed, const char *xmltv_eid) +{ + epg_broadcast_t **ebc; + int ret = 0; + if (!channel || !start || !stop || !xmltv_eid) return NULL; + if (stop <= start) return NULL; + if (stop <= gclk()) return NULL; + + ebc = _epg_broadcast_skel(); + (*ebc)->start = start; + (*ebc)->stop = stop; + + if((*ebc)->xmltv_eid) + { + free((*ebc)->xmltv_eid); + (*ebc)->xmltv_eid = NULL; + } + + ret = epg_broadcast_set_xmltv_eid(*ebc, xmltv_eid, changed); + + //If the XMLTV ID was not set, exit. + if(!ret){ + tvherror(LS_EPG, "Unable to set '%s' result '%d'", xmltv_eid, ret); + return NULL; + } + + return _epg_channel_add_broadcast(channel, ebc, src, create, save, changed); +} + epg_broadcast_t *epg_broadcast_find_by_time ( channel_t *channel, epggrab_module_t *src, time_t start, time_t stop, int create, int *save, epg_changes_t *changed ) @@ -948,6 +1012,7 @@ epg_broadcast_t *epg_broadcast_find_by_time ebc = _epg_broadcast_skel(); (*ebc)->start = start; (*ebc)->stop = stop; + (*ebc)->xmltv_eid = NULL; return _epg_channel_add_broadcast(channel, ebc, src, create, save, changed); } @@ -963,7 +1028,10 @@ int epg_broadcast_change_finish if (!(changes & EPG_CHANGED_EPISODE)) save |= epg_broadcast_set_episodelink_uri(broadcast, NULL, NULL); if (!(changes & EPG_CHANGED_DVB_EID)) - save |= epg_broadcast_set_dvb_eid(broadcast, 0, NULL); + { + save |= epg_broadcast_set_dvb_eid(broadcast, 0, NULL); + save |= epg_broadcast_set_xmltv_eid(broadcast, NULL, NULL); + } if (!(changes & EPG_CHANGED_IS_WIDESCREEN)) save |= epg_broadcast_set_is_widescreen(broadcast, 0, NULL); if (!(changes & EPG_CHANGED_IS_HD)) @@ -1041,6 +1109,7 @@ epg_broadcast_t *epg_broadcast_clone 1, save, &changes); if (ebc) { /* Copy metadata */ + *save |= epg_broadcast_set_xmltv_eid(ebc, src->xmltv_eid, &changes); *save |= epg_broadcast_set_is_widescreen(ebc, 
src->is_widescreen, &changes); *save |= epg_broadcast_set_is_hd(ebc, src->is_hd, &changes); *save |= epg_broadcast_set_is_bw(ebc, src->is_bw, &changes); @@ -1143,6 +1212,17 @@ int epg_broadcast_set_dvb_eid changed, EPG_CHANGED_DVB_EID); } +int epg_broadcast_set_xmltv_eid + ( epg_broadcast_t *b, const char *xmltv_eid, epg_changes_t *changed ) +{ + int save; + if (!b) return 0; + save = _epg_object_set_str(b, &b->xmltv_eid, xmltv_eid, + changed, EPG_CHANGED_DVB_EID); + + return save; +} + int epg_broadcast_set_is_widescreen ( epg_broadcast_t *b, uint8_t ws, epg_changes_t *changed ) { @@ -1559,6 +1639,8 @@ htsmsg_t *epg_broadcast_serialize ( epg_broadcast_t *broadcast ) htsmsg_add_str(m, "ch", channel_get_uuid(broadcast->channel, ubuf)); if (broadcast->dvb_eid) htsmsg_add_u32(m, "eid", broadcast->dvb_eid); + if (broadcast->xmltv_eid) + htsmsg_add_str(m, "xeid", broadcast->xmltv_eid); if (broadcast->is_widescreen) htsmsg_add_u32(m, "is_wd", 1); if (broadcast->is_hd) @@ -1663,6 +1745,8 @@ epg_broadcast_t *epg_broadcast_deserialize /* Get metadata */ if (!htsmsg_get_u32(m, "eid", &eid)) *save |= epg_broadcast_set_dvb_eid(ebc, eid, &changes); + if ((str = htsmsg_get_str(m, "xeid"))) + *save |= epg_broadcast_set_xmltv_eid(ebc, str, &changes); if (!htsmsg_get_u32(m, "is_wd", &u32)) *save |= epg_broadcast_set_is_widescreen(ebc, u32, &changes); if (!htsmsg_get_u32(m, "is_hd", &u32)) diff --git a/src/epg.h b/src/epg.h index da595fd39..930ffd911 100644 --- a/src/epg.h +++ b/src/epg.h @@ -310,6 +310,7 @@ struct epg_broadcast ///< We'll call it copyright_year since words like "complete" and "finished" ///< sound too similar to dvr recorded functionality. We'll only store the ///< year since we only get year not month and day. 
+ char *xmltv_eid; ///< XMLTV (or other) unique event identifier }; /* Lookup */ @@ -318,6 +319,10 @@ epg_broadcast_t *epg_broadcast_find_by_time time_t start, time_t stop, int create, int *save, epg_changes_t *changes ); epg_broadcast_t *epg_broadcast_find_by_eid ( struct channel *ch, uint16_t eid ); epg_broadcast_t *epg_broadcast_find_by_id ( uint32_t id ); +epg_broadcast_t *epg_broadcast_find_by_xmltv_eid + ( struct channel *ch, struct epggrab_module *src, + time_t start, time_t stop, int create, + int *save, epg_changes_t *changed, const char* xmltv_eid); /* Post-modify */ int epg_broadcast_change_finish( epg_broadcast_t *b, epg_changes_t changed, int merge ) @@ -331,6 +336,9 @@ epg_broadcast_t *epg_broadcast_clone int epg_broadcast_set_dvb_eid ( epg_broadcast_t *b, uint16_t dvb_eid, epg_changes_t *changed ) __attribute__((warn_unused_result)); +int epg_broadcast_set_xmltv_eid + ( epg_broadcast_t *b, const char *xmltv_eid, epg_changes_t *changed ) + __attribute__((warn_unused_result)); int epg_broadcast_set_running ( epg_broadcast_t *b, epg_running_t running ) __attribute__((warn_unused_result)); diff --git a/src/epggrab.h b/src/epggrab.h index 590df45df..974bfd615 100644 --- a/src/epggrab.h +++ b/src/epggrab.h @@ -200,6 +200,13 @@ struct epggrab_module_int ///< and extra details on to programme description for viewing by legacy clients. int xmltv_use_category_not_genre; ///< Use category tags and don't map to DVB genres. + const char *xmltv_xpath_category_code; ///< XPath string for extracting a category ETSI code. + const char *xmltv_xpath_unique_id; ///< XPath string for extracting a unique event ID. + const char *xmltv_xpath_series_link; ///< XPath string for extracting a series link. + const char *xmltv_xpath_episode_link; ///< XPath string for extracting an episode link. + int xmltv_xpath_series_use_standard; ///< If the XPath node is not found, use the standard TVH routine. 
+ int xmltv_xpath_episode_use_standard; ///< If the XPath node is not found, use the standard TVH routine. + /* Handle data */ char* (*grab) ( void *mod ); htsmsg_t* (*trans) ( void *mod, char *data ); diff --git a/src/epggrab/module/xmltv.c b/src/epggrab/module/xmltv.c index fe4b4e14e..ef437180d 100644 --- a/src/epggrab/module/xmltv.c +++ b/src/epggrab/module/xmltv.c @@ -14,6 +14,15 @@ * * You should have received a copy of the GNU General Public License * along with this program. If not, see . + * + * Notes - DMC April 2024. + * + * The XMLTV data received is first converted to a htsmsg format. + * Various tags and attributes are then extracted from the htsmsg + * and saved as EPG data. + * + * PLEASE NOTE: TVHeadEnd only processes a subset of the XMLTV schema, + * plus a non-standard tag . */ #include @@ -44,6 +53,16 @@ #define XMLTV_FIND "tv_find_grabbers" #define XMLTV_GRAB "tv_grab_" +/* + * Global variables for XPaths + */ +htsmsg_t *xmltv_xpath_category_code = NULL; +htsmsg_t *xmltv_xpath_unique = NULL; +htsmsg_t *xmltv_xpath_series = NULL; +htsmsg_t *xmltv_xpath_episode = NULL; +int xmltv_xpath_series_fallback = 0; +int xmltv_xpath_episode_fallback = 0; + /* ************************************************************************** * Parsing * *************************************************************************/ @@ -533,6 +552,15 @@ static int _xmltv_parse_age_rating /* * Parse category list + * Leisure hobbies + * Cricket + * NOTE: + * TVH seems to refer to the ETSI code as the 'genre' and to the + * text description as the 'category'. + * There is no ETSI code for 'Cricket', the closest is 0x45 'Team Sports'. + * In the above example, the genre is saved as 0x45, however, if scraping + * for 'extra information' is enabled, the text 'Cricket' will be added to + * the 'category' list. 
*/ static epg_genre_list_t *_xmltv_parse_categories ( htsmsg_t *tags ) @@ -540,10 +568,60 @@ static epg_genre_list_t htsmsg_t *e; htsmsg_field_t *f; epg_genre_list_t *egl = NULL; + const char *cat_name; + uint8_t cat_val; + int cat_flag = 0; + const char *cat_etsi; + HTSMSG_FOREACH(f, tags) { if (!strcmp(htsmsg_field_name(f), "category") && (e = htsmsg_get_map_by_field(f))) { - if (!egl) egl = calloc(1, sizeof(epg_genre_list_t)); - epg_genre_list_add_by_str(egl, htsmsg_get_str(e, "cdata"), NULL); + + cat_name = htsmsg_get_str(e, "cdata"); + + cat_etsi = NULL; + //If we have an XPath expression to search + if(xmltv_xpath_category_code) + { + cat_etsi = htsmsg_xml_xpath_search(e, xmltv_xpath_category_code); + + cat_flag = 0; + + //If we got a category code, use that instead of the text. + //https://www.etsi.org/deliver/etsi_en/300400_300499/300468/01.17.01_20/en_300468v011701a.pdf + //Table 29 + if(cat_etsi && (strlen(cat_etsi) > 2)) + { + tvhdebug(LS_XMLTV, "Identified XPath Category Code: '%s'", cat_etsi); + cat_val = 0; + if(cat_etsi[0] == '0' && (cat_etsi[1] == 'x' || cat_etsi[1] == 'X')) //If the code starts with '0x', look for HEX values. + { + sscanf(cat_etsi+2, "%hhx", &cat_val); + + if(cat_val != 0) + { + tvhdebug(LS_XMLTV, "XPath category code '%s' recognised as ETSI '0x%02x'.", cat_etsi, cat_val); + if (!egl) egl = calloc(1, sizeof(epg_genre_list_t)); + cat_flag = epg_genre_list_add_by_eit (egl, cat_val); + } + else + { + tvhdebug(LS_XMLTV, "XPath category code '%s' failed. Invalid hex.", cat_etsi); + cat_flag = 0; + } + } + }//END we have a category code + else + { + tvhdebug(LS_XMLTV, "XPath category code '%s' unusable, matching text '%s' instead.", cat_etsi, cat_name); + } + }//END we have a category XPath + + //If a hex value was not found or is invalid, use the text value instead. 
+ if(!cat_flag) + { + if (!egl) egl = calloc(1, sizeof(epg_genre_list_t)); + epg_genre_list_add_by_str(egl, cat_name, NULL); + } } } return egl; @@ -717,7 +795,7 @@ static int _xmltv_parse_programme_tags const int use_category_not_genre = ((epggrab_module_int_t *)mod)->xmltv_use_category_not_genre; int save = 0; epg_changes_t changes = 0; - epg_broadcast_t *ebc; + epg_broadcast_t *ebc = NULL; epg_genre_list_t *egl; epg_episode_num_t epnum; epg_set_t *set; @@ -729,6 +807,10 @@ static int _xmltv_parse_programme_tags time_t first_aired = 0; int8_t bw = -1; + const char *temp_unique = htsmsg_get_str(tags, "@@UNIQUE"); + const char *temp_series = htsmsg_get_str(tags, "@@SERIES"); + const char *temp_episode = htsmsg_get_str(tags, "@@EPISODE"); + if (epg_channel_ignore_broadcast(ch, start)) return 0; @@ -737,13 +819,55 @@ static int _xmltv_parse_programme_tags /* * Broadcast */ - ebc = epg_broadcast_find_by_time(ch, mod, start, stop, 1, &save, &changes); + + //If we got a unique XPath field, try to match an existing event based on that, + //if not, use the normal match based on start time only. + if(temp_unique) + { + tvhtrace(LS_XMLTV, "Searching for EPG event using XPath unique ID '%s'.", temp_unique); + ebc = epg_broadcast_find_by_xmltv_eid(ch, mod, start, stop, 1, &save, &changes, temp_unique); + //NULL will be returned if there is no match found. + if(ebc) + { + tvhtrace(LS_XMLTV, "Matched ID '%s' start '%"PRItime_t"/%"PRItime_t"' stop '%"PRItime_t"/%"PRItime_t"'.", temp_unique, ebc->start, start, ebc->stop, stop); + ebc->start = start; + ebc->stop = stop; + } + else + { + tvhtrace(LS_XMLTV, "No match for EPG event using XPath unique ID '%s'.", temp_unique); + } + } + + //If the broadcast event is still null, then either there was no XMLTV unique ID + //or there was, but it failed to match. The later is an edge case when this feature + //has been newly enabled with existing events already present. They will not match + //they expire. 
+ if(!ebc) + { + tvhtrace(LS_XMLTV, "Searching for EPG event using start/stop."); + ebc = epg_broadcast_find_by_time(ch, mod, start, stop, 1, &save, &changes); + if(ebc){ + tvhtrace(LS_XMLTV, "Matched EPG event using start/stop."); + } + else + { + tvhtrace(LS_XMLTV, "No match for EPG event using start/stop."); + } + } + if (!ebc) return 0; stats->broadcasts.total++; if (save && (changes & EPG_CHANGED_CREATE)) stats->broadcasts.created++; + /* Save the unique ID string */ + if(temp_unique) + { + save |= epg_broadcast_set_xmltv_eid(ebc, temp_unique, &changes); + } + /* Description/summary (wait for episode first) */ _xmltv_parse_lang_str(&desc, tags, "desc"); _xmltv_parse_lang_str(&summary, tags, "summary"); @@ -810,6 +934,53 @@ static int _xmltv_parse_programme_tags */ get_episode_info(mod, tags, &uri, &suri, &epnum); + if(temp_series) + { + if(suri) + { + free(suri); + } + suri = strdup(temp_series); + } + else + { + //If there was an XPath for series, but nothing was found + //AND we are NOT falling back to the standard method, + //then erase the crid that TVH manufactured from the module/series/episode. + if(xmltv_xpath_series && !xmltv_xpath_series_fallback) + { + if(suri) + { + free(suri); + suri = NULL; + } + } + + } + + if(temp_episode) + { + if(uri) + { + free(uri); + } + uri = strdup(temp_episode); + } + else + { + //If there was an XPath for episode, but nothing was found + //AND we are NOT falling back to the standard method, + //then erase the crid that TVH manufactured from the module/series/episode. + if(xmltv_xpath_episode && !xmltv_xpath_episode_fallback) + { + if(uri) + { + free(uri); + uri = NULL; + } + } + + } /* * Series Link */ @@ -832,6 +1003,10 @@ static int _xmltv_parse_programme_tags if (uri) { set = ebc->episodelink; save |= epg_broadcast_set_episodelink_uri(ebc, uri, &changes); + //DMC 28-Mar-2024. + //This free() was added because compared to the series link above + //it looked like not having it would lead to a memory leak. 
+ free(uri); stats->episodes.total++; if (changes & EPG_CHANGED_EPISODE) { if (set == NULL) @@ -916,6 +1091,7 @@ static int _xmltv_parse_programme stats->channels.modified++; } if (!LIST_FIRST(&ec->channels)) return 0; + if((s = htsmsg_get_str(attribs, "start")) == NULL) return 0; start = _xmltv_str2time(s); if((s = htsmsg_get_str(attribs, "stop")) == NULL) return 0; @@ -925,6 +1101,47 @@ static int _xmltv_parse_programme (attribs = htsmsg_get_map(subtag, "attrib")) != NULL) icon = htsmsg_get_str(attribs, "src"); + const char *temp_unique; + const char *temp_series; + const char *temp_episode; + + //NOTE - DMC April 2024 + //The XPath values need to be searched for here, before the rest of the processing, + //because the attributes of the root node are not available past + //this point. Only sub-nodes of are passed on to the next function. + //If XPath values are found here, add them to the htsmsg using special '@@' + //field names which can then be passed to the next function for further processing. + + //Search the current programme for XPath matches + if(xmltv_xpath_unique) + { + temp_unique = htsmsg_xml_xpath_search(body, xmltv_xpath_unique); + //If an XPath ID has been found, stash it in htsmsg so that it can + //be retrieved by the next function. 
+ if(temp_unique) + { + htsmsg_add_str(tags, "@@UNIQUE", temp_unique); + } + }//END stash the XPath unique ID + + if(xmltv_xpath_series) + { + temp_series = htsmsg_xml_xpath_search(body, xmltv_xpath_series); + if(temp_series) + { + htsmsg_add_str(tags, "@@SERIES", temp_series); + } + } + + if(xmltv_xpath_episode) + { + temp_episode = htsmsg_xml_xpath_search(body, xmltv_xpath_episode); + if(temp_episode) + { + htsmsg_add_str(tags, "@@EPISODE", temp_episode); + } + } + if(stop <= start || stop <= gclk()) return 0; ec->laststamp = gclk(); @@ -1028,7 +1245,16 @@ static int _xmltv_parse_channel } /** - * + * + * + * ...channel data + * <\channel> + * ...multiple channels + * + * ...programme data + * <\programme> + * ...multiple programmes + * */ static int _xmltv_parse_tv (epggrab_module_t *mod, htsmsg_t *body, epggrab_stats_t *stats) @@ -1040,6 +1266,82 @@ static int _xmltv_parse_tv if((tags = htsmsg_get_map(body, "tags")) == NULL) return 0; + //Pre-process the XPaths + //Only done once per XMLTV session. 
+ if(((epggrab_module_int_t *)mod)->xmltv_xpath_category_code) + { + tvhtrace(LS_XMLTV, "Parsing Category Code XPath: '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_category_code); + xmltv_xpath_category_code = htsmsg_xml_parse_xpath(((epggrab_module_int_t *)mod)->xmltv_xpath_category_code); + + if(htsmsg_is_empty(xmltv_xpath_category_code)) + { + tvhtrace(LS_XMLTV, "Failed to parse Category Code XPath '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_category_code); + } + } + else + { + tvhtrace(LS_XMLTV, "Category Code XPath not found."); + } + + if(((epggrab_module_int_t *)mod)->xmltv_xpath_unique_id) + { + tvhtrace(LS_XMLTV, "Parsing Unique ID XPath: '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_unique_id); + xmltv_xpath_unique = htsmsg_xml_parse_xpath(((epggrab_module_int_t *)mod)->xmltv_xpath_unique_id); + + if(htsmsg_is_empty(xmltv_xpath_unique)) + { + tvhtrace(LS_XMLTV, "Failed to parse Unique ID XPath '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_unique_id); + } + } + else + { + tvhtrace(LS_XMLTV, "Unique ID XPath not found."); + } + + if(((epggrab_module_int_t *)mod)->xmltv_xpath_series_link) + { + tvhtrace(LS_XMLTV, "Parsing SeriesLink XPath: '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_series_link); + xmltv_xpath_series = htsmsg_xml_parse_xpath(((epggrab_module_int_t *)mod)->xmltv_xpath_series_link); + + if(htsmsg_is_empty(xmltv_xpath_series)) + { + tvhtrace(LS_XMLTV, "Failed to parse SeriesLink XPath '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_series_link); + } + } + else + { + tvhtrace(LS_XMLTV, "SeriesLink XPath not found."); + } + + if(((epggrab_module_int_t *)mod)->xmltv_xpath_episode_link) + { + tvhtrace(LS_XMLTV, "Parsing EpisodeLink XPath: '%s'.", ((epggrab_module_int_t *)mod)->xmltv_xpath_episode_link); + xmltv_xpath_episode = htsmsg_xml_parse_xpath(((epggrab_module_int_t *)mod)->xmltv_xpath_episode_link); + + if(htsmsg_is_empty(xmltv_xpath_episode)) + { + tvhtrace(LS_XMLTV, "Failed to parse EpisodeLink XPath '%s'.", 
((epggrab_module_int_t *)mod)->xmltv_xpath_episode_link); + } + } + else + { + tvhtrace(LS_XMLTV, "EpisodeLink XPath not found."); + } + + //Set the fallback flags. + xmltv_xpath_series_fallback = 0; + if(((epggrab_module_int_t *)mod)->xmltv_xpath_series_use_standard) + { + xmltv_xpath_series_fallback = 1; + } + + xmltv_xpath_episode_fallback = 0; + if(((epggrab_module_int_t *)mod)->xmltv_xpath_episode_use_standard) + { + xmltv_xpath_episode_fallback = 1; + } + //Finished pre-processing the XPath stuff. + tvh_mutex_lock(&global_lock); epggrab_channel_begin_scan(mod); tvh_mutex_unlock(&global_lock); @@ -1063,6 +1365,23 @@ static int _xmltv_parse_tv epggrab_channel_end_scan(mod); tvh_mutex_unlock(&global_lock); + //If XPaths were used, release the parsed paths. + if(xmltv_xpath_unique) + { + htsmsg_destroy(xmltv_xpath_unique); + } + if(xmltv_xpath_series) + { + htsmsg_destroy(xmltv_xpath_series); + } + if(xmltv_xpath_episode) + { + htsmsg_destroy(xmltv_xpath_episode); + } + if(xmltv_xpath_category_code) + { + htsmsg_destroy(xmltv_xpath_category_code); + } return gsave; } @@ -1122,6 +1441,42 @@ static int _xmltv_parse "If this option is not ticked then we continue to map " \ "xmltv categories to genres and supply both to clients.") +#define XPATH_CATEGORY_CODE N_("Category Code XPath") +#define XPATH_CATEGORY_CODE_DESC \ + N_("The XPath-like expression used to extract the category "\ + "ETSI code from the XMLTV data. Root node = 'category'.") + +#define XPATH_UNIQUE_ID_NAME N_("Unique Event ID XPath") +#define XPATH_UNIQUE_ID_DESC \ + N_("The XPath-like expression used to extract a unique event "\ + "identifier from the XMLTV data. This ID is used to "\ + "match existing EPG events so that they can be updated " \ + "rather than replaced. Root node = 'programme'.") + +#define XPATH_SERIES_LINK_NAME N_("SeriesLink XPath") +#define XPATH_SERIES_LINK_DESC \ + N_("The XPath-like expression used to extract a SeriesLink "\ + "identifier from the XMLTV data. 
This ID is used "\ + "to identify multiple occurrences of the same series. "\ + " Root node = 'programme'.") + +#define XPATH_EPISODE_LINK_NAME N_("EpisodeLink XPath") +#define XPATH_EPISODE_LINK_DESC \ + N_("The XPath-like expression used to extract an EpisodeLink "\ + "identifier from the XMLTV data. This ID is used "\ + "to identify multiple occurrences of the same episode. "\ + " Root node = 'programme'.") + +#define XPATH_SERIES_USE_STANDARD_NAME N_("SeriesLink XPath fallback") +#define XPATH_SERIES_USE_STANDARD_DESC \ + N_("If a SeriesLink XPath is not found, use the standard TVH "\ + "method for creating a SeriesLink.") + +#define XPATH_EPISODE_USE_STANDARD_NAME N_("EpisodeLink XPath fallback") +#define XPATH_EPISODE_USE_STANDARD_DESC \ + N_("If an EpisodeLink XPath is not found, use the standard TVH "\ + "method for creating an EpisodeLink.") + static htsmsg_t * xmltv_dn_chnum_list ( void *o, const char *lang ) { @@ -1137,6 +1492,17 @@ const idclass_t epggrab_mod_int_xmltv_class = { .ic_super = &epggrab_mod_int_class, .ic_class = "epggrab_mod_int_xmltv", .ic_caption = N_("EPG - Internal XMLTV EPG Grabber"), + .ic_groups = (const property_group_t[]) { + { + .name = N_("General Settings"), + .number = 1, + }, + { + .name = N_("XPath Settings"), + .number = 2, + }, + {} + }, .ic_properties = (const property_t[]){ { .type = PT_INT, @@ -1172,6 +1538,54 @@ const idclass_t epggrab_mod_int_xmltv_class = { .off = offsetof(epggrab_module_int_t, xmltv_use_category_not_genre), .group = 1 }, + { + .type = PT_STR, + .id = "xpath_category_code", + .name = XPATH_CATEGORY_CODE, + .desc = XPATH_CATEGORY_CODE_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_category_code), + .group = 2 + }, + { + .type = PT_STR, + .id = "xpath_unique", + .name = XPATH_UNIQUE_ID_NAME, + .desc = XPATH_UNIQUE_ID_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_unique_id), + .group = 2 + }, + { + .type = PT_STR, + .id = "xpath_serieslink", + .name = XPATH_SERIES_LINK_NAME, + 
.desc = XPATH_SERIES_LINK_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_series_link), + .group = 2 + }, + { + .type = PT_STR, + .id = "xpath_episodelink", + .name = XPATH_EPISODE_LINK_NAME, + .desc = XPATH_EPISODE_LINK_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_episode_link), + .group = 2 + }, + { + .type = PT_BOOL, + .id = "xpath_series_use_standard", + .name = XPATH_SERIES_USE_STANDARD_NAME, + .desc = XPATH_SERIES_USE_STANDARD_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_series_use_standard), + .group = 2 + }, + { + .type = PT_BOOL, + .id = "xpath_episode_use_standard", + .name = XPATH_EPISODE_USE_STANDARD_NAME, + .desc = XPATH_EPISODE_USE_STANDARD_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_episode_use_standard), + .group = 2 + }, {} } }; @@ -1180,6 +1594,17 @@ const idclass_t epggrab_mod_ext_xmltv_class = { .ic_super = &epggrab_mod_ext_class, .ic_class = "epggrab_mod_ext_xmltv", .ic_caption = N_("EPG - External XMLTV EPG Grabber"), + .ic_groups = (const property_group_t[]) { + { + .name = N_("General Settings"), + .number = 1, + }, + { + .name = N_("XPath Settings"), + .number = 2, + }, + {} + }, .ic_properties = (const property_t[]){ { .type = PT_BOOL, @@ -1214,6 +1639,54 @@ const idclass_t epggrab_mod_ext_xmltv_class = { .off = offsetof(epggrab_module_int_t, xmltv_use_category_not_genre), .group = 1 }, + { + .type = PT_STR, + .id = "xpath_category_code", + .name = XPATH_CATEGORY_CODE, + .desc = XPATH_CATEGORY_CODE_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_category_code), + .group = 2 + }, + { + .type = PT_STR, + .id = "xpath_unique", + .name = XPATH_UNIQUE_ID_NAME, + .desc = XPATH_UNIQUE_ID_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_unique_id), + .group = 2 + }, + { + .type = PT_STR, + .id = "xpath_serieslink", + .name = XPATH_SERIES_LINK_NAME, + .desc = XPATH_SERIES_LINK_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_series_link), + .group = 2 + }, + { + .type = 
PT_STR, + .id = "xpath_episodelink", + .name = XPATH_EPISODE_LINK_NAME, + .desc = XPATH_EPISODE_LINK_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_episode_link), + .group = 2 + }, + { + .type = PT_BOOL, + .id = "xpath_series_use_standard", + .name = XPATH_SERIES_USE_STANDARD_NAME, + .desc = XPATH_SERIES_USE_STANDARD_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_series_use_standard), + .group = 2 + }, + { + .type = PT_BOOL, + .id = "xpath_episode_use_standard", + .name = XPATH_EPISODE_USE_STANDARD_NAME, + .desc = XPATH_EPISODE_USE_STANDARD_DESC, + .off = offsetof(epggrab_module_int_t, xmltv_xpath_episode_use_standard), + .group = 2 + }, {} } }; diff --git a/src/htsmsg_xml.c b/src/htsmsg_xml.c index 4123e74d6..62ff1f384 100644 --- a/src/htsmsg_xml.c +++ b/src/htsmsg_xml.c @@ -25,7 +25,7 @@ * htsmsg's with UTF-8 encoded payloads * * Supports: Example: - * + * * Comments * Processing Instructions * CDATA ]]> @@ -96,7 +96,7 @@ typedef struct cdata_content { char cc_buf[0]; } cdata_content_t; -static char *htsmsg_xml_parse_cd(xmlparser_t *xp, +static char *htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src); /** @@ -245,7 +245,7 @@ htsmsg_xml_parse_attrib while(is_xmlws(*src)) src++; - + /* Parse attribute payload */ quote = *src++; if(quote != '"' && quote != '\'') { @@ -274,7 +274,7 @@ htsmsg_xml_parse_attrib while(is_xmlws(*src)) src++; - if(xmlns_scope_list != NULL && + if(xmlns_scope_list != NULL && attriblen > 6 && !memcmp(attribname, "xmlns:", 6)) { attribname += 6; @@ -383,7 +383,7 @@ htsmsg_xml_parse_tag(xmlparser_t *xp, htsmsg_t *parent, char *src) if(tagname[i] == ':') { LIST_FOREACH(ns, &xp->xp_namespaces, xmlns_global_link) { - if(ns->xmlns_prefix_len == i && + if(ns->xmlns_prefix_len == i && !memcmp(ns->xmlns_prefix, tagname, ns->xmlns_prefix_len)) { int llen = taglen - i - 1; @@ -608,7 +608,7 @@ htsmsg_xml_parse_cd0 src = htsmsg_xml_parse_tag(xp, tags, src); continue; } - + if(*src == '&' && !raw) { if(cc != NULL) 
cc->cc_end = src; @@ -664,7 +664,7 @@ htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src) int c = 0, l, y = 0; char *x, *body; htsmsg_t *tags = htsmsg_create_map(); - + TAILQ_INIT(&ccq); src = htsmsg_xml_parse_cd0(xp, &ccq, tags, NULL, src, 0); @@ -697,7 +697,7 @@ htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src) assert(cc != NULL); assert(TAILQ_NEXT(cc, cc_link) == NULL); - + f = htsmsg_field_add(parent, "cdata", HMF_STR, 0, 0); f->hmf_str = cc->cc_start; *cc->cc_end = 0; @@ -721,7 +721,7 @@ htsmsg_xml_parse_cd(xmlparser_t *xp, htsmsg_t *parent, char *src) c += put_utf8(body + c, *x); break; } - + TAILQ_REMOVE(&ccq, cc, cc_link); free(cc); } @@ -767,7 +767,7 @@ htsmsg_parse_prolog(xmlparser_t *xp, char *src) while(is_xmlws(*src)) src++; - + if(!strncmp(src, " sizeof(outStr) - 1) + { + //Formatting note: + //In 64 bit Ubuntu, sizeof() returns a 'long unsigned int' + //In 32 bit i386-debian-strech, sizeof() returns an 'unsigned int' + //This causes cross compile issues. + //'%zu' is supposed to work for 'size_t' variables in C99. 
+ + tvhtrace(LS_XMLTV, "XPath = '%s' too long, max len = %zu.", xpath, (sizeof(outStr) - 1)); + return NULL; + } + + memset(outStr, 0, sizeof(outStr)); + + for(inPos = 0; inPos < xpLen; inPos++) //Loop through the xpath string + { + xpType = 0; //Keep byte + if(xpath[inPos] == '/') + { + xpType = 1; //Node + } + else if (xpath[inPos] == '@' && xpTypeSaved != 3) + { + xpType = 2; //Attribute + } + else if (xpath[inPos] == '[') + { + xpType = 3; //Condition + } + + //Add this byte to the existing item + if (xpType == 0 && xpath[inPos] != ']') + { + outStr[outPos] = xpath[inPos]; + outPos++; + outStr[outPos] = 0; + outType = xpTypeSaved; + } + else + { + xpTypeSaved = xpType; + } + + if(inPos == (xpLen - 1) || (xpType != 0 && outPos != 0 )) + { + + if(outType != 0) + { + + if(outType == 1 && !strcmp(outStr, "text()")) + { + outType = 4; + } + + condAtt[0] = 0; + condVal[0] = 0; + if(outType == 3) + { + sscanf(outStr, "@%[^'=']=%s", condAtt, condVal); + } + + f = htsmsg_create_map(); + + if(outType == 3) + { + htsmsg_add_str(f, "n", condAtt); //Condition attribute + } + else + { + htsmsg_add_str(f, "n", outStr); //Name + } + + htsmsg_add_s64(f, "t", outType); //Type + htsmsg_add_str(f, "v", condVal); //Condition value + + snprintf(inPosStr, sizeof(inPosStr), "%d", inPos); + htsmsg_add_msg(m, inPosStr, f); + + } + + outPos = 0; + outStr[0] = 0; + outType = 0; + } + + }//END for loop through string + + return m; + +} + +/** + * Take a htsmsg holding an XML object model + * and a htsmsg holding an XPath model and + * try to match the XPath to a node or + * attribute. 
+ */
+/*
+ * Parameters:
+ *   message - parsed XML element to start from. Child elements are looked
+ *             up in its "tags" map and attributes in its "attrib" map.
+ *   xpath   - XPath model: a map of step maps, each carrying "n" (name),
+ *             "t" (type) and "v" (condition value). Presumably the model
+ *             built by htsmsg_xml_parse_xpath() - confirm against callers.
+ *
+ * Step types ("t"):
+ *   1 - node:      descend into the named child element.
+ *   2 - attribute: return the named attribute of the current element.
+ *   3 - condition: the named attribute must equal "v" for the search to
+ *                  continue with the next step on the same element.
+ *   4 - text():    return the "cdata" of the most recently matched node.
+ *   Any other type (including a step with no readable "t") is ignored.
+ *
+ * Returns the matched string, or NULL if either argument is NULL, a step
+ * fails to match, or the steps run out without reaching an attribute or
+ * text() step. The string comes from htsmsg_get_str() on 'message', so it
+ * is presumably owned by 'message' and must not be freed - TODO confirm.
+ */
+const char *htsmsg_xml_xpath_search(htsmsg_t *message, htsmsg_t *xpath)
+{
+  htsmsg_t       *temp_msg  = message;  //Element the search is currently positioned on
+  const char     *str_saved = NULL;     //Text of the most recently matched node
+  htsmsg_field_t *f;
+
+  //The XPath model may be NULL when building it failed, so never iterate over it blindly.
+  if(message == NULL || xpath == NULL) return NULL;
+
+  HTSMSG_FOREACH(f, xpath) {
+
+    htsmsg_t   *temp_path = htsmsg_get_map(xpath, htsmsg_field_name(f));
+    const char *name      = htsmsg_get_str(temp_path, "n");
+    const char *criteria  = htsmsg_get_str(temp_path, "v");
+    const char *value;
+    htsmsg_t   *attribs;
+    htsmsg_t   *tags;
+    htsmsg_t   *pass_tags;
+    int64_t     temp_type = 0;
+
+    //temp_type is reset to 0 (no action) for every step so that a step
+    //without a readable "t" is skipped instead of acting on an
+    //uninitialised value or on the type left over from the previous step.
+    htsmsg_get_s64(temp_path, "t", &temp_type);
+
+    //Never hand a NULL pointer to '%s', and print the signed type with PRId64.
+    tvhdebug(LS_XMLTV, "htsmsg_xml_xpath_search '%s' = '%s', '%"PRId64"', '%s'",
+             htsmsg_field_name(f),
+             name != NULL ? name : "(null)",
+             temp_type,
+             criteria != NULL ? criteria : "(null)");
+
+    if(temp_type == 4) //This item returns the text of the previous matched XML node
+    {
+      return str_saved;
+    }
+
+    if(temp_type == 1) //This item deals with an XML node
+    {
+      str_saved = NULL;
+
+      if((tags = htsmsg_get_map(temp_msg, "tags")) == NULL)
+      {
+        tvherror(LS_XMLTV, "Failed to find tags");
+        return NULL;
+      }
+
+      //A step without a name can never match a child element.
+      pass_tags = (name != NULL) ? htsmsg_get_map(tags, name) : NULL;
+      if(pass_tags == NULL)
+      {
+        tvherror(LS_XMLTV, "Failed to match '%s'", name != NULL ? name : "(null)");
+        return NULL;
+      }
+
+      tvhdebug(LS_XMLTV, "Matched node '%s'", name);
+      str_saved = htsmsg_get_str(pass_tags, "cdata");
+      temp_msg = pass_tags;  //Later steps are evaluated relative to this node
+      continue;
+    }//END of node type
+
+    if(temp_type == 2 || temp_type == 3) //These items deal with an XML attribute.
+    {
+      if((attribs = htsmsg_get_map(temp_msg, "attrib")) == NULL) return NULL;
+      if(name == NULL || (value = htsmsg_get_str(attribs, name)) == NULL) return NULL;
+
+      if(temp_type == 2) //If this is a simple attribute, return the value.
+      {
+        tvhdebug(LS_XMLTV, "Returning attribute value '%s'", value);
+        return value;
+      }//END just return attribute value
+
+      //From here on this is an attribute comparison (type 3).
+      if(criteria == NULL)
+      {
+        //Report the attribute the condition refers to; the criteria
+        //itself is NULL here and must not be passed to '%s'.
+        tvherror(LS_XMLTV, "NO CRITERIA '%s'", name);
+        return NULL;
+      }
+
+      tvhdebug(LS_XMLTV, "COMPARING: '%s' to '%s'", value, criteria);
+      if(strcmp(value, criteria) != 0)
+      {
+        return NULL;  //Condition not met: the whole search fails
+      }
+
+      //Continue the search to the next XPath item, but not the next node
+      tvhdebug(LS_XMLTV, "MATCHED: '%s' to '%s'", value, criteria);
+    }//END of attribute type
+
+  }//END loop through each XPath item.
+
+  //The steps ran out without reaching an attribute or text() step.
+  return NULL;
+
+}
diff --git a/src/htsmsg_xml.h b/src/htsmsg_xml.h index c4d4f29dc..b965b4653 100644 --- a/src/htsmsg_xml.h +++ b/src/htsmsg_xml.h @@ -27,5 +27,7 @@ const char *htsmsg_xml_get_cdata_str (htsmsg_t *tags, const char *tag); int htsmsg_xml_get_cdata_u32 (htsmsg_t *tags, const char *tag, uint32_t *u32); const char *htsmsg_xml_get_attr_str(htsmsg_t *tag, const char *attr); int htsmsg_xml_get_attr_u32(htsmsg_t *tag, const char *attr, uint32_t *u32); +htsmsg_t *htsmsg_xml_parse_xpath(const char *xpath); +const char *htsmsg_xml_xpath_search(htsmsg_t *tag, htsmsg_t *xpath); #endif /* HTSMSG_XML_H_ */ diff --git a/src/webui/static/img/doc/channel/grabber_xpath_fields.png b/src/webui/static/img/doc/channel/grabber_xpath_fields.png new file mode 100755 index 000000000..a69ca486a Binary files /dev/null and b/src/webui/static/img/doc/channel/grabber_xpath_fields.png differ