From: Sebastian Freundt Date: Fri, 23 May 2014 08:19:02 +0000 (+0000) Subject: Store and read back mtimes through Last-Modified custom header X-Git-Tag: v3.1.900a~295^2^2~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=84114d7db770e6a93115d4645c995a90a89e8477;p=thirdparty%2Flibarchive.git Store and read back mtimes through Last-Modified custom header Signed-off-by: Sebastian Freundt --- diff --git a/libarchive/archive_read_support_format_warc.c b/libarchive/archive_read_support_format_warc.c index 926847bb7..c74922763 100644 --- a/libarchive/archive_read_support_format_warc.c +++ b/libarchive/archive_read_support_format_warc.c @@ -44,6 +44,9 @@ __FBSDID("$FreeBSD$"); #ifdef HAVE_CTYPE_H #include <ctype.h> #endif +#ifdef HAVE_TIME_H +#include <time.h> +#endif #include "archive.h" #include "archive_entry.h" @@ -105,6 +108,8 @@ static unsigned int _warc_rdver(const char buf[static 10U], size_t bsz); static unsigned int _warc_rdtyp(const char *buf, size_t bsz); static warc_string_t _warc_rduri(const char *buf, size_t bsz); static ssize_t _warc_rdlen(const char *buf, size_t bsz); +static time_t _warc_rdrtm(const char *buf, size_t bsz); +static time_t _warc_rdmtm(const char *buf, size_t bsz); static const char *_warc_find_eoh(const char *buf, size_t bsz); @@ -194,6 +199,10 @@ _warc_rdhdr(struct archive_read *a, struct archive_entry *entry) warc_type_t ftyp; /* content-length+error monad */ ssize_t cntlen; + /* record time is the WARC-Date time we reinterpret it as ctime */ + time_t rtime; + /* mtime is the Last-Modified time which will be the entry's mtime */ + time_t mtime; start_over: /* just use read_ahead() they keep track of unconsumed @@ -238,6 +247,13 @@ start_over: &a->archive, EINVAL, "Bad content length"); return (ARCHIVE_FATAL); + } else if ((rtime = _warc_rdrtm(buf, eoh - buf)) == (time_t)-1) { + /* record time is mandatory as per WARC/1.0, + * so just barf here, fast and loud */ + archive_set_error( + &a->archive, EINVAL, + "Bad record time"); +
return (ARCHIVE_FATAL); } /* start off with the type */ @@ -262,6 +278,14 @@ start_over: w->pool.str[fnam.len] = '\0'; /* let noone else know about the pool, it's a secret, shhh */ fnam.str = w->pool.str; + + /* snarf mtime or deduce from rtime + * this is a custom header added by our writer, it's quite + * hard to believe anyone else would go through with it + * (apart from being part of some http responses of course) */ + if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { + mtime = rtime; + } break; default: fnam.len = 0U; @@ -281,6 +305,9 @@ start_over: archive_entry_copy_pathname(entry, fnam.str); archive_entry_set_size(entry, cntlen); archive_entry_set_perm(entry, 0644); + /* rtime is the new ctime, mtime stays mtime */ + archive_entry_set_ctime(entry, rtime, 0L); + archive_entry_set_mtime(entry, mtime, 0L); break; } /*@fallthrough@*/ @@ -411,6 +438,82 @@ xmemmem(const char *haystack, size_t hz, const char *needle, size_t nz) return NULL; } +static int +strtoi_lim(const char *str, const char **ep, int llim, int ulim) +{ + int res = 0; + const char *sp; + /* we keep track of the number of digits via rulim */ + int rulim; + + for (sp = str, rulim = ulim > 10 ? 
ulim : 10; + res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; + sp++, rulim /= 10) { + res *= 10; + res += *sp - '0'; + } + if (sp == str) { + res = -1; + } else if (res < llim || res > ulim) { + res = -2; + } + *ep = (const char*)sp; + return res; +} + +static time_t +xstrpisotime(const char *s, char **endptr) +{ +/** like strptime() but strictly for ISO 8601 Zulu strings */ + struct tm tm; + time_t res = (time_t)-1; + + /* make sure tm is clean */ + memset(&tm, 0, sizeof(tm)); + + /* as a courtesy to our callers, and since this is a non-standard + * routine, we skip leading whitespace */ + for (; isspace(*s); s++); + + /* read year */ + if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { + goto out; + } + /* read month */ + if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { + goto out; + } + /* read day-of-month */ + if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { + goto out; + } + /* read hour */ + if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { + goto out; + } + /* read minute */ + if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { + goto out; + } + /* read second */ + if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { + goto out; + } + + /* massage TM to fulfill some of POSIX' constraints */ + tm.tm_year -= 1900; + tm.tm_mon--; + + /* now convert our custom tm struct to a unix stamp */ + res = mktime(&tm); + +out: + if (endptr != NULL) { + *endptr = deconst(s); + } + return res; +} + static unsigned int _warc_rdver(const char buf[static 10U], size_t bsz) { @@ -569,6 +672,52 @@ _warc_rdlen(const char *buf, size_t bsz) return (size_t)len; } +static time_t +_warc_rdrtm(const char *buf, size_t bsz) +{ + static const char _key[] = "\r\nWARC-Date:"; + const char *val; + char *on = NULL; + time_t res; + + if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { + /* no bother */ + return (time_t)-1; + } + + /* xstrpisotime() kindly overreads whitespace for
us, so use that */ + val += sizeof(_key) - 1U; + res = xstrpisotime(val, &on); + if (on == NULL || !isspace(*on)) { + /* hm, can we trust that number? Best not. */ + return (time_t)-1; + } + return res; +} + +static time_t +_warc_rdmtm(const char *buf, size_t bsz) +{ + static const char _key[] = "\r\nLast-Modified:"; + const char *val; + char *on = NULL; + time_t res; + + if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { + /* no bother */ + return (time_t)-1; + } + + /* xstrpisotime() kindly overreads whitespace for us, so use that */ + val += sizeof(_key) - 1U; + res = xstrpisotime(val, &on); + if (on == NULL || !isspace(*on)) { + /* hm, can we trust that number? Best not. */ + return (time_t)-1; + } + return res; +} + static const char* _warc_find_eoh(const char *buf, size_t bsz) { diff --git a/libarchive/archive_write_set_format_warc.c b/libarchive/archive_write_set_format_warc.c index 82c603eed..90d868109 100644 --- a/libarchive/archive_write_set_format_warc.c +++ b/libarchive/archive_write_set_format_warc.c @@ -88,7 +88,8 @@ typedef struct { warc_type_t type; const char *tgturi; const char *recid; - time_t rectim; + time_t rtime; + time_t mtime; const char *cnttyp; size_t cntlen; } warc_essential_hdr_t; @@ -188,7 +189,8 @@ _warc_header(struct archive_write *a, struct archive_entry *entry) WT_INFO, /*uri*/NULL, /*urn*/NULL, - /*tim*/w->now, + /*rtm*/w->now, + /*mtm*/w->now, /*cty*/"application/warc-fields", /*len*/sizeof(warcinfo) - 1U, }; @@ -221,7 +223,8 @@ _warc_header(struct archive_write *a, struct archive_entry *entry) WT_RSRC, /*uri*/archive_entry_pathname(entry), /*urn*/NULL, - /*tim*/w->now, + /*rtm*/w->now, + /*mtm*/archive_entry_mtime(entry), /*cty*/NULL, /*len*/archive_entry_size(entry), }; @@ -313,6 +316,19 @@ _warc_free(struct archive_write *a) x += __r; \ } while (0) +static size_t +xstrftime(char *s, size_t max, const char *fmt, time_t t) +{ +/** like strftime(3) but for time_t objects */ + struct tm *rt; + + if ((rt = 
gmtime(&t)) == NULL) { + return 0U; + } + /* leave the hard yacker to our role model strftime() */ + return strftime(s, max, fmt, rt); +} + static ssize_t _popul_ehdr(char *tgt, size_t tsz, warc_essential_hdr_t hdr) { @@ -353,18 +369,15 @@ _popul_ehdr(char *tgt, size_t tsz, warc_essential_hdr_t hdr) "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); } - /* we could essentially put one of mtime/ctime/atime here */ - { - struct tm *rt; + /* record time is usually when the http is sent off, + * just treat the archive writing as such for a moment */ + tp += xstrftime(tp, ep - tp, + "WARC-Date: %FT%H:%M:%SZ\r\n", hdr.rtime); + + /* while we're at it, record the mtime */ + tp += xstrftime(tp, ep - tp, + "Last-Modified: %FT%H:%M:%SZ\r\n", hdr.mtime); - rt = gmtime(&hdr.rectim); - if (rt == NULL) { - return -1; - } - /* next one can't fail, - * at least it won't set TP out of its bounds */ - tp += strftime(tp, ep - tp, "WARC-Date: %FT%H:%M:%SZ\r\n", rt); - } if (hdr.recid == NULL) { /* generate one, grrrr */ warc_uuid_t u;