From 9693801580c0cf7c70e862d305270a16b52826a7 Mon Sep 17 00:00:00 2001
From: Sebastian Freundt
Date: Tue, 20 May 2014 13:26:50 +0000
Subject: [PATCH] Provide ISO 28500:2009 reader (aka warc, aka web archive)

---
 Makefile.am                                   |   1 +
 libarchive/CMakeLists.txt                     |   1 +
 libarchive/archive.h                          |   1 +
 libarchive/archive_read_support_format_all.c  |   1 +
 libarchive/archive_read_support_format_warc.c | 654 ++++++++++++++++++
 5 files changed, 658 insertions(+)
 create mode 100644 libarchive/archive_read_support_format_warc.c

diff --git a/Makefile.am b/Makefile.am
index 3f7aea108..10822edfd 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -167,6 +167,7 @@ libarchive_la_SOURCES= \
 	libarchive/archive_read_support_format_rar.c \
 	libarchive/archive_read_support_format_raw.c \
 	libarchive/archive_read_support_format_tar.c \
+	libarchive/archive_read_support_format_warc.c \
 	libarchive/archive_read_support_format_xar.c \
 	libarchive/archive_read_support_format_zip.c \
 	libarchive/archive_string.c \
diff --git a/libarchive/CMakeLists.txt b/libarchive/CMakeLists.txt
index 6d5222650..c4e08311e 100644
--- a/libarchive/CMakeLists.txt
+++ b/libarchive/CMakeLists.txt
@@ -87,6 +87,7 @@ SET(libarchive_SOURCES
   archive_read_support_format_rar.c
   archive_read_support_format_raw.c
   archive_read_support_format_tar.c
+  archive_read_support_format_warc.c
   archive_read_support_format_xar.c
   archive_read_support_format_zip.c
   archive_string.c
diff --git a/libarchive/archive.h b/libarchive/archive.h
index c88673293..49a39e5fa 100644
--- a/libarchive/archive.h
+++ b/libarchive/archive.h
@@ -399,6 +399,7 @@ __LA_DECL int archive_read_support_format_mtree(struct archive *);
 __LA_DECL int archive_read_support_format_rar(struct archive *);
 __LA_DECL int archive_read_support_format_raw(struct archive *);
 __LA_DECL int archive_read_support_format_tar(struct archive *);
+__LA_DECL int archive_read_support_format_warc(struct archive *);
 __LA_DECL int archive_read_support_format_xar(struct archive *);
 /* archive_read_support_format_zip() enables both streamable and seekable
  * zip readers. */
diff --git a/libarchive/archive_read_support_format_all.c b/libarchive/archive_read_support_format_all.c
index 53fe6fa39..2127ebd33 100644
--- a/libarchive/archive_read_support_format_all.c
+++ b/libarchive/archive_read_support_format_all.c
@@ -61,6 +61,7 @@ archive_read_support_format_all(struct archive *a)
 	archive_read_support_format_mtree(a);
 	archive_read_support_format_tar(a);
 	archive_read_support_format_xar(a);
+	archive_read_support_format_warc(a);
 
 	/*
 	 * Install expensive bidders last.  By doing them last, we
diff --git a/libarchive/archive_read_support_format_warc.c b/libarchive/archive_read_support_format_warc.c
new file mode 100644
index 000000000..62372a219
--- /dev/null
+++ b/libarchive/archive_read_support_format_warc.c
@@ -0,0 +1,654 @@
+/*-
+ * Copyright (c) 2014 Sebastian Freundt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "archive_platform.h"
+__FBSDID("$FreeBSD$");
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+
+#include "archive.h"
+#include "archive_entry.h"
+#include "archive_private.h"
+#include "archive_read_private.h"
+
+typedef enum {
+	WT_NONE,
+	/* warcinfo */
+	WT_INFO,
+	/* metadata */
+	WT_META,
+	/* resource */
+	WT_RSRC,
+	/* request, unsupported */
+	WT_REQ,
+	/* response, turned into an entry just like resource */
+	WT_RSP,
+	/* revisit, unsupported */
+	WT_RVIS,
+	/* conversion, unsupported */
+	WT_CONV,
+	/* continuation, unsupported at the moment */
+	WT_CONT,
+	/* invalid type */
+	LAST_WT
+} warc_type_t;
+
+typedef struct {
+	size_t len;
+	const char *str;
+} warc_string_t;
+
+typedef struct {
+	size_t len;
+	char *str;
+} warc_strbuf_t;
+
+struct warc_s {
+	/* content length ahead */
+	size_t cntlen;
+	/* and how much we've processed so far */
+	size_t cntoff;
+	/* bytes handed out by _warc_read() but not consumed yet */
+	size_t unconsumed;
+
+	/* string pool */
+	warc_strbuf_t pool;
+};
+
+static int _warc_bid(struct archive_read *a, int);
+static int _warc_cleanup(struct archive_read *a);
+static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
+static int _warc_skip(struct archive_read *a);
+static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
+
+/* private routines */
+static unsigned int _warc_rdver(const char buf[static 10U], size_t bsz);
+static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
+static warc_string_t _warc_rduri(const char *buf, size_t bsz);
+static ssize_t _warc_rdlen(const char *buf, size_t bsz);
+static const char *_warc_find_eoh(const char *buf, size_t bsz);
+
+
+/* Register the WARC reader with archive _A.  The per-format state allocated
+ * here is released again by _warc_cleanup(). */
+int
+archive_read_support_format_warc(struct archive *_a)
+{
+	struct archive_read *a = (struct archive_read *)_a;
+	struct warc_s *w;
+	int r;
+
+	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
+	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
+
+	if ((w = malloc(sizeof(*w))) == NULL) {
+		archive_set_error(&a->archive, ENOMEM,
+		    "Can't allocate warc data");
+		return (ARCHIVE_FATAL);
+	}
+	memset(w, 0, sizeof(*w));
+
+	r = __archive_read_register_format(
+		a, w, "warc",
+		_warc_bid, NULL, _warc_rdhdr, _warc_read,
+		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
+
+	if (r != ARCHIVE_OK) {
+		free(w);
+		return (r);
+	}
+	return (ARCHIVE_OK);
+}
+
+/* Release the string pool and the per-format state. */
+static int
+_warc_cleanup(struct archive_read *a)
+{
+	struct warc_s *w = a->format->data;
+
+	if (w->pool.len > 0U) {
+		free(w->pool.str);
+	}
+	free(w);
+	a->format->data = NULL;
+	return (ARCHIVE_OK);
+}
+
+/* Bid 64 if the stream starts with a WARC/x.y version line we can handle
+ * (above 0.0 and up to 1.0), and -1 otherwise. */
+static int
+_warc_bid(struct archive_read *a, int best_bid)
+{
+	const char *hdr;
+	ssize_t nrd;
+	unsigned int ver;
+
+	(void)best_bid; /* UNUSED */
+
+	/* check first line of file, it should be a record already */
+	if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) {
+		/* no idea what to do */
+		return -1;
+	} else if (nrd < 12) {
+		/* nah, not for us, our magic cookie is at least 12 bytes */
+		return -1;
+	}
+
+	/* otherwise snarf the record's version number */
+	ver = _warc_rdver(hdr, nrd);
+	if (ver == 0U || ver > 10000U) {
+		/* oh oh oh, best not to wager ... */
+		return -1;
+	}
+
+	/* otherwise be confident */
+	return (64);
+}
+
+/* Parse the next record header.  Resource and response records that carry a
+ * usable WARC-Target-URI become regular-file entries, every other record is
+ * skipped.  Returns ARCHIVE_OK, ARCHIVE_EOF or ARCHIVE_FATAL. */
+static int
+_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
+{
+#define HDR_PROBE_LEN		(512U)
+	struct warc_s *w = a->format->data;
+	unsigned int ver;
+	const char *buf;
+	ssize_t nrd;
+	const char *eoh;
+	/* for the file name, saves some strndup()'ing */
+	warc_string_t fnam;
+	/* warc record type, not that we really use it a lot */
+	warc_type_t ftyp;
+	/* content-length+error monad */
+	ssize_t cntlen;
+
+start_over:
+	/* just use read_ahead() they keep track of unconsumed
+	 * bits and bobs for us; no need to put an extra shift in
+	 * and reproduce that functionality here */
+	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
+	if (buf == NULL && nrd >= 12) {
+		/* fewer than HDR_PROBE_LEN bytes are left, which may
+		 * still be a complete record, so settle for those */
+		buf = __archive_read_ahead(a, (size_t)nrd, &nrd);
+	}
+
+	if (nrd < 0) {
+		/* no good */
+		archive_set_error(
+			&a->archive, ARCHIVE_ERRNO_MISC,
+			"Bad record header");
+		return (ARCHIVE_FATAL);
+	} else if (buf == NULL || nrd < 12) {
+		/* there should be room for at least WARC/bla\r\n
+		 * must be EOF therefore */
+		return (ARCHIVE_EOF);
+	}
+	/* looks good so far, try and find the end of the header now */
+	eoh = _warc_find_eoh(buf, nrd);
+	if (eoh == NULL) {
+		/* still no good, the header end might be beyond the
+		 * probe we've requested, but then again who'd cram
+		 * so much stuff into the header *and* be 28500-compliant */
+		archive_set_error(
+			&a->archive, ARCHIVE_ERRNO_MISC,
+			"Bad record header");
+		return (ARCHIVE_FATAL);
+	} else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) {
+		/* nawww, I wish they promised backward compatibility
+		 * anyhoo, in their infinite wisdom the 28500 guys might
+		 * come up with something we can't possibly handle so
+		 * best end things here */
+		archive_set_error(
+			&a->archive, ARCHIVE_ERRNO_MISC,
+			"Unsupported record version");
+		return (ARCHIVE_FATAL);
+	} else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0) {
+		/* nightmare!  the specs say content-length is mandatory
+		 * so I don't feel overly bad stopping the reader here */
+		archive_set_error(
+			&a->archive, EINVAL,
+			"Bad content length");
+		return (ARCHIVE_FATAL);
+	}
+
+	/* start off with the type */
+	ftyp = _warc_rdtyp(buf, eoh - buf);
+	/* and let future calls know about the content */
+	w->cntlen = cntlen;
+	w->cntoff = 0U;
+	w->unconsumed = 0U;
+
+	switch (ftyp) {
+	case WT_RSRC:
+	case WT_RSP:
+		/* only try and read the filename in the cases that are
+		 * guaranteed to have one */
+		fnam = _warc_rduri(buf, eoh - buf);
+		/* bang to our string pool, so we save a
+		 * malloc()+free() roundtrip */
+		if (fnam.len + 1U > w->pool.len) {
+			const size_t nusz = ((fnam.len + 64U) / 64U) * 64U;
+			char *nustr = realloc(w->pool.str, nusz);
+
+			if (nustr == NULL) {
+				archive_set_error(
+					&a->archive, ENOMEM,
+					"Out of memory");
+				return (ARCHIVE_FATAL);
+			}
+			w->pool.len = nusz;
+			w->pool.str = nustr;
+		}
+		if (fnam.len > 0U) {
+			memcpy(w->pool.str, fnam.str, fnam.len);
+		}
+		w->pool.str[fnam.len] = '\0';
+		/* let no one else know about the pool, it's a secret, shhh */
+		fnam.str = w->pool.str;
+		break;
+	default:
+		fnam.len = 0U;
+		break;
+	}
+
+	/* now eat some of those delicious buffer bits */
+	__archive_read_consume(a, eoh - buf);
+
+	switch (ftyp) {
+	case WT_RSRC:
+	case WT_RSP:
+		if (fnam.len > 0U) {
+			/* populate entry object */
+			archive_entry_set_filetype(entry, AE_IFREG);
+			archive_entry_copy_pathname(entry, fnam.str);
+			archive_entry_set_size(entry, cntlen);
+			archive_entry_set_perm(entry, 0644);
+			break;
+		}
+		/*@fallthrough@*/
+	default:
+		/* consume the content and start over */
+		_warc_skip(a);
+		goto start_over;
+	}
+	return (ARCHIVE_OK);
+}
+
+/* Hand out the next chunk of the current record's content, clamped to the
+ * announced Content-Length.  Returns ARCHIVE_EOF once the content is
+ * exhausted. */
+static int
+_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
+{
+	struct warc_s *w = a->format->data;
+	const char *rab;
+	ssize_t nrd;
+
+	if (w->cntoff >= w->cntlen) {
+	eof:
+		/* it's our lucky day, no work, we can leave early */
+		*buf = NULL;
+		*bsz = 0U;
+		*off = w->cntoff;
+		return (ARCHIVE_EOF);
+	}
+
+	/* the chunk handed out by the previous call is done with now,
+	 * read_ahead() alone never advances the stream */
+	if (w->unconsumed > 0U) {
+		__archive_read_consume(a, w->unconsumed);
+		w->unconsumed = 0U;
+	}
+
+	rab = __archive_read_ahead(a, 1U, &nrd);
+	if (nrd < 0) {
+		*bsz = 0U;
+		/* big catastrophe */
+		return (int)nrd;
+	} else if (nrd == 0) {
+		goto eof;
+	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
+		/* clamp to content-length */
+		nrd = w->cntlen - w->cntoff;
+	}
+	*off = w->cntoff;
+	*bsz = nrd;
+	*buf = rab;
+
+	w->cntoff += nrd;
+	w->unconsumed = (size_t)nrd;
+	return (ARCHIVE_OK);
+}
+
+/* Discard whatever is left of the current record, including the \r\n\r\n
+ * that terminates it. */
+static int
+_warc_skip(struct archive_read *a)
+{
+	struct warc_s *w = a->format->data;
+
+	/* skip the chunk handed out last, the rest of the content that was
+	 * never read, and the trailer */
+	__archive_read_consume(a,
+	    w->unconsumed + (w->cntlen - w->cntoff) + 4U/*\r\n\r\n separator*/);
+	w->cntlen = 0U;
+	w->cntoff = 0U;
+	w->unconsumed = 0U;
+	return (ARCHIVE_OK);
+}
+
+
+/* private routines */
+/* Strip the const qualifier off C without pointer arithmetic on void. */
+static void*
+deconst(const void *c)
+{
+	union {
+		const void *c;
+		void *v;
+	} u;
+
+	u.c = c;
+	return u.v;
+}
+
+/* Locate the first occurrence of NEEDLE (NZ bytes) in HAYSTACK (HZ bytes),
+ * like memmem(3), which is not available everywhere.  Returns a pointer
+ * into HAYSTACK or NULL. */
+static char*
+xmemmem(const char *haystack, size_t hz, const char *needle, size_t nz)
+{
+	const char *const eoh = haystack + hz;
+	const char *const eon = needle + nz;
+	const char *hp;
+	const char *np;
+	const char *cand;
+	unsigned int hsum;
+	unsigned int nsum;
+	unsigned int identicalp;
+
+	/* trivial checks first
+	 * a 0-sized needle is defined to be found anywhere in haystack
+	 * then run strchr() to find a candidate in HAYSTACK (i.e. a portion
+	 * that happens to begin with *NEEDLE) */
+	if (nz == 0UL) {
+		return deconst(haystack);
+	} else if ((haystack = memchr(haystack, *needle, hz)) == NULL) {
+		/* trivial */
+		return NULL;
+	}
+
+	/* First characters of haystack and needle are the same now. Both are
+	 * guaranteed to be at least one character long.  Now computes the sum
+	 * of characters values of needle together with the sum of the first
+	 * needle_len characters of haystack. */
+	for (hp = haystack + 1U, np = needle + 1U,
+		     hsum = *haystack, nsum = *haystack,
+		     identicalp = 1U;
+	     hp < eoh && np < eon;
+	     hsum += *hp, nsum += *np, identicalp &= *hp == *np, hp++, np++);
+
+	/* HP now references the (NZ + 1)-th character. */
+	if (np < eon) {
+		/* haystack is smaller than needle, :O */
+		return NULL;
+	} else if (identicalp) {
+		/* found a match */
+		return deconst(haystack);
+	}
+
+	/* now loop through the rest of haystack,
+	 * updating the sum iteratively */
+	for (cand = haystack; hp < eoh; hp++) {
+		hsum -= *cand++;
+		hsum += *hp;
+
+		/* Since the sum of the characters is already known to be
+		 * equal at that point, it is enough to check just NZ - 1
+		 * characters for equality,
+		 * also CAND is by design < HP, so no need for range checks */
+		if (hsum == nsum && memcmp(cand, needle, nz - 1U) == 0) {
+			return deconst(cand);
+		}
+	}
+	return NULL;
+}
+
+/* Parse the "WARC/x.y" version line at BUF.  Returns major * 10000 +
+ * minor * 100, e.g. 10000 for WARC/1.0, or a value well beyond 10000 when
+ * BUF does not start with a version line we understand.  BUF must hold at
+ * least 10 bytes. */
+static unsigned int
+_warc_rdver(const char buf[static 10U], size_t bsz)
+{
+	static const char magic[] = "WARC/";
+	unsigned int ver;
+
+	(void)bsz; /* UNUSED */
+
+	if (memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
+		/* nope */
+		return 99999U;
+	}
+	/* looks good so far, read the version number for a laugh */
+	buf += sizeof(magic) - 1U;
+	/* most common case gets a quick-check here */
+	if (memcmp(buf, "1.0\r\n", 5U) == 0) {
+		ver = 10000U;
+	} else {
+		switch (*buf) {
+		case '0':
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+			if (buf[1U] == '.' && isdigit((unsigned char)buf[2U])) {
+				char *on;
+
+				/* set up major version */
+				ver = (buf[0U] - '0') * 10000U;
+				/* minor version, anyone? */
+				ver += (strtol(buf + 2U, &on, 10)) * 100U;
+				/* don't parse anything else */
+				if (on > buf + 2U) {
+					break;
+				}
+			}
+			/*@fallthrough@*/
+		case '9':
+		default:
+			/* just make the version ridiculously high */
+			ver = 999999U;
+			break;
+		}
+	}
+	return ver;
+}
+
+/* Map the value of the WARC-Type header in BUF to a warc_type_t, WT_NONE
+ * if the header is missing or the type is unknown. */
+static unsigned int
+_warc_rdtyp(const char *buf, size_t bsz)
+{
+	static const char _key[] = "\r\nWARC-Type:";
+	const char *const eob = buf + bsz;
+	const char *val;
+
+	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+		/* no bother */
+		return WT_NONE;
+	}
+	/* overread whitespace */
+	for (val += sizeof(_key) - 1U;
+	     val < eob && isspace((unsigned char)*val); val++);
+
+	if (val + 8U > eob) {
+		;
+	} else if (memcmp(val, "resource", 8U) == 0) {
+		return WT_RSRC;
+	} else if (memcmp(val, "warcinfo", 8U) == 0) {
+		return WT_INFO;
+	} else if (memcmp(val, "metadata", 8U) == 0) {
+		return WT_META;
+	} else if (memcmp(val, "request", 7U) == 0) {
+		return WT_REQ;
+	} else if (memcmp(val, "response", 8U) == 0) {
+		return WT_RSP;
+	} else if (memcmp(val, "revisit", 7U) == 0) {
+		return WT_RVIS;
+	} else if (memcmp(val, "conversi", 8U) == 0) {
+		return WT_CONV;
+	} else if (memcmp(val, "continua", 8U) == 0) {
+		return WT_CONT;
+	}
+	return WT_NONE;
+}
+
+/* Derive a file name from the WARC-Target-URI header in BUF: the part after
+ * file://, or the path after the host for http and ftp URIs.  The result
+ * points into BUF and is not NUL terminated; its len is 0 if there is no
+ * usable URI. */
+static warc_string_t
+_warc_rduri(const char *buf, size_t bsz)
+{
+	static const char _key[] = "\r\nWARC-Target-URI:";
+	const char *const eob = buf + bsz;
+	const char *val;
+	const char *uri;
+	const char *eol;
+	warc_string_t res = {0U};
+
+	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+		/* no bother */
+		return res;
+	}
+	/* overread whitespace */
+	for (val += sizeof(_key) - 1U;
+	     val < eob && isspace((unsigned char)*val); val++);
+
+	/* overread URL designators */
+	if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) {
+		/* not touching that! */
+		return res;
+	} else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) {
+		/* no end of line? :O */
+		return res;
+	}
+
+	/* massage uri to point to after :// */
+	uri += 3U;
+	/* also massage eol to point to the first whitespace
+	 * after the last non-whitespace character before
+	 * the end of the line */
+	for (; eol > uri && isspace((unsigned char)eol[-1]); eol--);
+
+	/* now then, inspect the URI */
+	if (memcmp(val, "file", 4U) == 0) {
+		/* perfect, nothing left to do here */
+
+	} else if (memcmp(val, "http", 4U) == 0 ||
+		   memcmp(val, "ftp", 3U) == 0) {
+		/* overread domain, and the first / */
+		while (uri < eol && *uri++ != '/');
+	} else {
+		/* not sure what to do? best to bugger off */
+		return res;
+	}
+	res.str = uri;
+	res.len = eol - uri;
+	return res;
+}
+
+/* Read the mandatory Content-Length header in BUF.  Returns the length, or
+ * -1 if the header is missing or its value is not a plain non-negative
+ * number. */
+static ssize_t
+_warc_rdlen(const char *buf, size_t bsz)
+{
+	static const char _key[] = "\r\nContent-Length:";
+	const char *const eob = buf + bsz;
+	const char *val;
+	char *on = NULL;
+	long int len;
+
+	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+		/* no bother */
+		return -1;
+	}
+
+	/* skip blanks by hand, strtol() would also step across line ends
+	 * and pick up digits from a later line */
+	for (val += sizeof(_key) - 1U;
+	     val < eob && (*val == ' ' || *val == '\t'); val++);
+	if (val >= eob || !isdigit((unsigned char)*val)) {
+		/* no number to be had */
+		return -1;
+	}
+	errno = 0;
+	len = strtol(val, &on, 10);
+	if (errno == ERANGE || !isspace((unsigned char)*on)) {
+		/* hm, can we trust that number?  Best not. */
+		return -1;
+	}
+	return (ssize_t)len;
+}
+
+/* Find the blank line that ends the record header.  Returns a pointer just
+ * past the \r\n\r\n, or NULL if BUF does not hold one. */
+static const char*
+_warc_find_eoh(const char *buf, size_t bsz)
+{
+	static const char _marker[] = "\r\n\r\n";
+	const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
+
+	if (hit != NULL) {
+		hit += sizeof(_marker) - 1U;
+	}
+	return hit;
+}
+
+/* archive_read_support_format_warc.c ends here */
-- 
2.47.2