--- /dev/null
+/*-
+ * Copyright (c) 2014 Sebastian Freundt
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "archive_platform.h"
+__FBSDID("$FreeBSD$");
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+
+#include "archive.h"
+#include "archive_entry.h"
+#include "archive_private.h"
+#include "archive_read_private.h"
+
+typedef enum {
+ WT_NONE,
+ /* warcinfo */
+ WT_INFO,
+ /* metadata */
+ WT_META,
+ /* resource */
+ WT_RSRC,
+ /* request, unsupported */
+ WT_REQ,
+ /* response, unsupported */
+ WT_RSP,
+ /* revisit, unsupported */
+ WT_RVIS,
+ /* conversion, unsupported */
+ WT_CONV,
+ /* continutation, unsupported at the moment */
+ WT_CONT,
+ /* invalid type */
+ LAST_WT
+} warc_type_t;
+
+typedef struct {
+ size_t len;
+ const char *str;
+} warc_string_t;
+
+typedef struct {
+ size_t len;
+ char *str;
+} warc_strbuf_t;
+
+struct warc_s {
+ /* content length ahead */
+ size_t cntlen;
+ /* and how much we've processed so far */
+ size_t cntoff;
+
+ /* string pool */
+ warc_strbuf_t pool;
+};
+
+static int _warc_bid(struct archive_read *a, int);
+static int _warc_cleanup(struct archive_read *a);
+static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
+static int _warc_skip(struct archive_read *a);
+static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
+
+/* private routines */
+static unsigned int _warc_rdver(const char buf[static 10U], size_t bsz);
+static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
+static warc_string_t _warc_rduri(const char *buf, size_t bsz);
+static ssize_t _warc_rdlen(const char *buf, size_t bsz);
+static const char *_warc_find_eoh(const char *buf, size_t bsz);
+
+\f
+int
+archive_read_support_format_warc(struct archive *_a)
+{
+ struct archive_read *a = (struct archive_read *)_a;
+ struct warc_s *w;
+ int r;
+
+ archive_check_magic(_a, ARCHIVE_READ_MAGIC,
+ ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
+
+ if ((w = malloc(sizeof(*w))) == NULL) {
+ archive_set_error(&a->archive, ENOMEM,
+ "Can't allocate warc data");
+ return (ARCHIVE_FATAL);
+ }
+ memset(w, 0, sizeof(*w));
+
+ r = __archive_read_register_format(
+ a, w, "warc",
+ _warc_bid, NULL, _warc_rdhdr, _warc_read,
+ _warc_skip, NULL, _warc_cleanup, NULL, NULL);
+
+ if (r != ARCHIVE_OK) {
+ free(w);
+ return (r);
+ }
+ return (ARCHIVE_OK);
+}
+
+static int
+_warc_cleanup(struct archive_read *a)
+{
+ struct warc_s *w = a->format->data;
+
+ if (w->pool.len > 0U) {
+ free(w->pool.str);
+ }
+ free(w);
+ a->format->data = NULL;
+ return (ARCHIVE_OK);
+}
+
+static int
+_warc_bid(struct archive_read *a, int best_bid)
+{
+ const char *hdr;
+ ssize_t nrd;
+ unsigned int ver;
+
+ (void)best_bid; /* UNUSED */
+
+ /* check first line of file, it should be a record already */
+ if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) {
+ /* no idea what to do */
+ return -1;
+ } else if (nrd < 12) {
+ /* nah, not for us, our magic cookie is at least 12 bytes */
+ return -1;
+ }
+
+ /* otherwise snarf the record's version number */
+ ver = _warc_rdver(hdr, nrd);
+ if (ver == 0U || ver > 10000U) {
+ /* oh oh oh, best not to wager ... */
+ return -1;
+ }
+
+ /* otherwise be confident */
+ return (64);
+}
+
+static int
+_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
+{
+#define HDR_PROBE_LEN (512U)
+ struct warc_s *w = a->format->data;
+ unsigned int ver;
+ const char *buf;
+ ssize_t nrd;
+ const char *eoh;
+ /* for the file name, saves some strndup()'ing */
+ warc_string_t fnam;
+ /* warc record type, not that we really use it a lot */
+ warc_type_t ftyp;
+ /* content-length+error monad */
+ ssize_t cntlen;
+
+start_over:
+ /* just use read_ahead() they keep track of unconsumed
+ * bits and bobs for us; no need to put an extra shift in
+ * and reproduce that functionality here */
+ buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
+
+ if (nrd < 0 || buf == NULL) {
+ /* no good */
+ archive_set_error(
+ &a->archive, ARCHIVE_ERRNO_MISC,
+ "Bad record header");
+ return (ARCHIVE_FATAL);
+ } else if (nrd < 12U) {
+ /* there should be room for at least WARC/bla\r\n
+ * must be EOF therefore */
+ return (ARCHIVE_EOF);
+ }
+ /* looks good so far, try and find the end of the header now */
+ eoh = _warc_find_eoh(buf, nrd);
+ if (eoh == NULL) {
+ /* still no good, the header end might be beyond the
+ * probe we've requested, but then again who'd cram
+ * so much stuff into the header *and* be 28500-compliant */
+ archive_set_error(
+ &a->archive, ARCHIVE_ERRNO_MISC,
+ "Bad record header");
+ return (ARCHIVE_FATAL);
+ } else if ((ver = _warc_rdver(buf, eoh - buf)) > 10000U) {
+ /* nawww, I wish they promised backward compatibility
+ * anyhoo, in their infinite wisdom the 28500 guys might
+ * come up with something we can't possibly handle so
+ * best end things here */
+ archive_set_error(
+ &a->archive, ARCHIVE_ERRNO_MISC,
+ "Unsupported record version");
+ return (ARCHIVE_FATAL);
+ } else if ((cntlen = _warc_rdlen(buf, eoh - buf)) < 0U) {
+ /* nightmare! the specs say content-length is mandatory
+ * so I don't feel overly bad stopping the reader here */
+ archive_set_error(
+ &a->archive, EINVAL,
+ "Bad content length");
+ return (ARCHIVE_FATAL);
+ }
+
+ /* start off with the type */
+ ftyp = _warc_rdtyp(buf, eoh - buf);
+ /* and let future calls know about the content */
+ w->cntlen = cntlen;
+ w->cntoff = 0U;
+
+ switch (ftyp) {
+ case WT_RSRC:
+ case WT_RSP:
+ /* only try and read the filename in the cases that are
+ * guaranteed to have one */
+ fnam = _warc_rduri(buf, eoh - buf);
+ /* bang to our string pool, so we save a
+ * malloc()+free() roundtrip */
+ if (fnam.len + 1U > w->pool.len) {
+ w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
+ w->pool.str = realloc(w->pool.str, w->pool.len);
+ }
+ memcpy(w->pool.str, fnam.str, fnam.len);
+ w->pool.str[fnam.len] = '\0';
+ /* let noone else know about the pool, it's a secret, shhh */
+ fnam.str = w->pool.str;
+ break;
+ default:
+ fnam.len = 0U;
+ break;
+ }
+
+ /* now eat some of those delicious buffer bits */
+ __archive_read_consume(a, eoh - buf);
+
+ switch (ftyp) {
+ case WT_RSRC:
+ case WT_RSP:
+ if (fnam.len > 0U) {
+ /* populate entry object */
+ archive_entry_set_filetype(entry, AE_IFREG);
+ archive_entry_copy_pathname(entry, fnam.str);
+ archive_entry_set_size(entry, cntlen);
+ archive_entry_set_perm(entry, 0644);
+ break;
+ }
+ /*@fallthrough@*/
+ default:
+ /* consume the content and start over */
+ _warc_skip(a);
+ goto start_over;
+ }
+ return (ARCHIVE_OK);
+}
+
+static int
+_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
+{
+ struct warc_s *w = a->format->data;
+ const char *rab;
+ ssize_t nrd;
+
+ if (w->cntoff >= w->cntlen) {
+ eof:
+ /* it's our lucky day, no work, we can leave early */
+ *buf = NULL;
+ *bsz = 0U;
+ *off = w->cntoff;
+ return (ARCHIVE_EOF);
+ }
+
+ rab = __archive_read_ahead(a, 1U, &nrd);
+ if (nrd < 0) {
+ *bsz = 0U;
+ /* big catastrophe */
+ return (int)nrd;
+ } else if (nrd == 0) {
+ goto eof;
+ } else if (nrd > w->cntlen - w->cntoff) {
+ /* clamp to content-length */
+ nrd = w->cntlen - w->cntoff;
+ }
+ *off = w->cntoff;
+ *bsz = nrd;
+ *buf = rab;
+
+ w->cntoff += nrd;
+ return (ARCHIVE_OK);
+}
+
+static int
+_warc_skip(struct archive_read *a)
+{
+ struct warc_s *w = a->format->data;
+
+ __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
+ w->cntlen = 0U;
+ w->cntoff = 0U;
+ return (ARCHIVE_OK);
+}
+
+\f
+/* private routines */
+static void*
+deconst(const void *c)
+{
+ return (void*)0x1 + (c - (const void*)0x1);
+}
+
+static char*
+xmemmem(const char *haystack, size_t hz, const char *needle, size_t nz)
+{
+ const char *const eoh = haystack + hz;
+ const char *const eon = needle + nz;
+ const char *hp;
+ const char *np;
+ const char *cand;
+ unsigned int hsum;
+ unsigned int nsum;
+ unsigned int identicalp;
+
+ /* trivial checks first
+ * a 0-sized needle is defined to be found anywhere in haystack
+ * then run strchr() to find a candidate in HAYSTACK (i.e. a portion
+ * that happens to begin with *NEEDLE) */
+ if (nz == 0UL) {
+ return deconst(haystack);
+ } else if ((haystack = memchr(haystack, *needle, hz)) == NULL) {
+ /* trivial */
+ return NULL;
+ }
+
+ /* First characters of haystack and needle are the same now. Both are
+ * guaranteed to be at least one character long. Now computes the sum
+ * of characters values of needle together with the sum of the first
+ * needle_len characters of haystack. */
+ for (hp = haystack + 1U, np = needle + 1U,
+ hsum = *haystack, nsum = *haystack,
+ identicalp = 1U;
+ hp < eoh && np < eon;
+ hsum += *hp, nsum += *np, identicalp = *hp == *np, hp++, np++);
+
+ /* HP now references the (NZ + 1)-th character. */
+ if (np < eon) {
+ /* haystack is smaller than needle, :O */
+ return NULL;
+ } else if (identicalp) {
+ /* found a match */
+ return deconst(haystack);
+ }
+
+ /* now loop through the rest of haystack,
+ * updating the sum iteratively */
+ for (cand = haystack; hp < eoh; hp++) {
+ hsum -= *cand++;
+ hsum += *hp;
+
+ /* Since the sum of the characters is already known to be
+ * equal at that point, it is enough to check just NZ - 1
+ * characters for equality,
+ * also CAND is by design < HP, so no need for range checks */
+ if (hsum == nsum && memcmp(cand, needle, nz - 1U) == 0) {
+ return deconst(cand);
+ }
+ }
+ return NULL;
+}
+
+static unsigned int
+_warc_rdver(const char buf[static 10U], size_t bsz)
+{
+ static const char magic[] = "WARC/";
+ unsigned int ver;
+
+ (void)bsz; /* UNUSED */
+
+ if (memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
+ /* nope */
+ return 99999U;
+ }
+ /* looks good so far, read the version number for a laugh */
+ buf += sizeof(magic) - 1U;
+ /* most common case gets a quick-check here */
+ if (memcmp(buf, "1.0\r\n", 5U) == 0) {
+ ver = 10000U;
+ } else {
+ switch (*buf) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ if (buf[1U] == '.') {
+ char *on;
+
+ /* set up major version */
+ ver = (buf[0U] - '0') * 10000U;
+ /* minor version, anyone? */
+ ver += (strtol(buf + 2U, &on, 10)) * 100U;
+ /* don't parse anything else */
+ if (on > buf + 2U) {
+ break;
+ }
+ }
+ /*@fallthrough@*/
+ case '9':
+ default:
+ /* just make the version ridiculously high */
+ ver = 999999U;
+ break;
+ }
+ }
+ return ver;
+}
+
+static unsigned int
+_warc_rdtyp(const char *buf, size_t bsz)
+{
+ static const char _key[] = "\r\nWARC-Type:";
+ const char *const eob = buf + bsz;
+ const char *val;
+
+ if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+ /* no bother */
+ return WT_NONE;
+ }
+ /* overread whitespace */
+ for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
+
+ if (val + 8U > eob) {
+ ;
+ } else if (memcmp(val, "resource", 8U) == 0) {
+ return WT_RSRC;
+ } else if (memcmp(val, "warcinfo", 8U) == 0) {
+ return WT_INFO;
+ } else if (memcmp(val, "metadata", 8U) == 0) {
+ return WT_META;
+ } else if (memcmp(val, "request", 7U) == 0) {
+ return WT_REQ;
+ } else if (memcmp(val, "response", 8U) == 0) {
+ return WT_RSP;
+ } else if (memcmp(val, "conversi", 8U) == 0) {
+ return WT_CONV;
+ } else if (memcmp(val, "continua", 8U) == 0) {
+ return WT_CONT;
+ }
+ return WT_NONE;
+}
+
+static warc_string_t
+_warc_rduri(const char *buf, size_t bsz)
+{
+ static const char _key[] = "\r\nWARC-Target-URI:";
+ const char *const eob = buf + bsz;
+ const char *val;
+ const char *uri;
+ const char *eol;
+ warc_string_t res = {0U};
+
+ if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+ /* no bother */
+ return res;
+ }
+ /* overread whitespace */
+ for (val += sizeof(_key) - 1U; val < eob && isspace(*val); val++);
+
+ /* overread URL designators */
+ if ((uri = xmemmem(val, eob - val, "://", 3U)) == NULL) {
+ /* not touching that! */
+ return res;
+ } else if ((eol = memchr(uri, '\n', eob - uri)) == NULL) {
+ /* no end of line? :O */
+ return res;
+ }
+
+ /* massage uri to point to after :// */
+ uri += 3U;
+ /* also massage eol to point to the first whitespace
+ * after the last non-whitespace character before
+ * the end of the line */
+ for (; eol > uri && isspace(eol[-1]); eol--);
+
+ /* now then, inspect the URI */
+ if (memcmp(val, "file", 4U) == 0) {
+ /* perfect, nothing left to do here */
+
+ } else if (memcmp(val, "http", 4U) == 0 ||
+ memcmp(val, "ftp", 3U) == 0) {
+ /* overread domain, and the first / */
+ while (uri < eol && *uri++ != '/');
+ } else {
+ /* not sure what to do? best to bugger off */
+ return res;
+ }
+ res.str = uri;
+ res.len = eol - uri;
+ return res;
+}
+
+static ssize_t
+_warc_rdlen(const char *buf, size_t bsz)
+{
+ static const char _key[] = "\r\nContent-Length:";
+ const char *val;
+ char *on = NULL;
+ long int len;
+
+ if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
+ /* no bother */
+ return -1;
+ }
+
+ /* strtol kindly overreads whitespace for us, so use that */
+ val += sizeof(_key) - 1U;
+ len = strtol(val, &on, 10);
+ if (on == NULL || !isspace(*on)) {
+ /* hm, can we trust that number? Best not. */
+ return -1;
+ }
+ return (size_t)len;
+}
+
+static const char*
+_warc_find_eoh(const char *buf, size_t bsz)
+{
+ static const char _marker[] = "\r\n\r\n";
+ const char *hit = memmem(buf, bsz, _marker, sizeof(_marker) - 1U);
+
+ if (hit != NULL) {
+ hit += sizeof(_marker) - 1U;
+ }
+ return hit;
+}
+
+/* archive_read_support_format_warc.c ends here */