From 667961d7ae5ab8e288f4d0309981a6b82b465edc Mon Sep 17 00:00:00 2001 From: Sebastian Freundt Date: Tue, 20 May 2014 08:19:38 +0000 Subject: [PATCH] Provide ISO 28500:2009 writer (aka warc, aka web archive) --- Makefile.am | 1 + libarchive/CMakeLists.txt | 1 + libarchive/archive.h | 2 + libarchive/archive_write_set_format.c | 1 + libarchive/archive_write_set_format_by_name.c | 1 + libarchive/archive_write_set_format_warc.c | 406 ++++++++++++++++++ 6 files changed, 412 insertions(+) create mode 100644 libarchive/archive_write_set_format_warc.c diff --git a/Makefile.am b/Makefile.am index 3f2bacae2..3f7aea108 100644 --- a/Makefile.am +++ b/Makefile.am @@ -212,6 +212,7 @@ libarchive_la_SOURCES= \ libarchive/archive_write_set_format_ustar.c \ libarchive/archive_write_set_format_v7tar.c \ libarchive/archive_write_set_format_gnutar.c \ + libarchive/archive_write_set_format_warc.c \ libarchive/archive_write_set_format_xar.c \ libarchive/archive_write_set_format_zip.c \ libarchive/archive_write_set_options.c \ diff --git a/libarchive/CMakeLists.txt b/libarchive/CMakeLists.txt index 480cea1aa..6d5222650 100644 --- a/libarchive/CMakeLists.txt +++ b/libarchive/CMakeLists.txt @@ -132,6 +132,7 @@ SET(libarchive_SOURCES archive_write_set_format_shar.c archive_write_set_format_ustar.c archive_write_set_format_v7tar.c + archive_write_set_format_warc.c archive_write_set_format_xar.c archive_write_set_format_zip.c archive_write_set_options.c diff --git a/libarchive/archive.h b/libarchive/archive.h index e90cb4931..c88673293 100644 --- a/libarchive/archive.h +++ b/libarchive/archive.h @@ -289,6 +289,7 @@ typedef int archive_switch_callback(struct archive *, void *_client_data1, #define ARCHIVE_FORMAT_CAB 0xC0000 #define ARCHIVE_FORMAT_RAR 0xD0000 #define ARCHIVE_FORMAT_7ZIP 0xE0000 +#define ARCHIVE_FORMAT_WARC 0xF0000 /* * Codes returned by archive_read_format_capabilities(). @@ -738,6 +739,7 @@ __LA_DECL int archive_write_set_format_shar(struct archive *); __LA_DECL int archive_write_set_format_shar_dump(struct archive *); __LA_DECL int archive_write_set_format_ustar(struct archive *); __LA_DECL int archive_write_set_format_v7tar(struct archive *); +__LA_DECL int archive_write_set_format_warc(struct archive *); __LA_DECL int archive_write_set_format_xar(struct archive *); __LA_DECL int archive_write_set_format_zip(struct archive *); __LA_DECL int archive_write_zip_set_compression_deflate(struct archive *); diff --git a/libarchive/archive_write_set_format.c b/libarchive/archive_write_set_format.c index 9055753b2..744302d06 100644 --- a/libarchive/archive_write_set_format.c +++ b/libarchive/archive_write_set_format.c @@ -57,6 +57,7 @@ struct { int code; int (*setter)(struct archive *); } codes[] = { ARCHIVE_FORMAT_TAR_PAX_RESTRICTED, archive_write_set_format_pax_restricted }, { ARCHIVE_FORMAT_TAR_USTAR, archive_write_set_format_ustar }, + { ARCHIVE_FORMAT_WARC, archive_write_set_format_warc }, { ARCHIVE_FORMAT_XAR, archive_write_set_format_xar }, { ARCHIVE_FORMAT_ZIP, archive_write_set_format_zip }, { 0, NULL } diff --git a/libarchive/archive_write_set_format_by_name.c b/libarchive/archive_write_set_format_by_name.c index 4f3ce7d35..a2ce7c6cd 100644 --- a/libarchive/archive_write_set_format_by_name.c +++ b/libarchive/archive_write_set_format_by_name.c @@ -70,6 +70,7 @@ struct { const char *name; int (*setter)(struct archive *); } names[] = { "ustar", archive_write_set_format_ustar }, { "v7tar", archive_write_set_format_v7tar }, { "v7", archive_write_set_format_v7tar }, + { "warc", archive_write_set_format_warc }, { "xar", archive_write_set_format_xar }, { "zip", archive_write_set_format_zip }, { NULL, NULL } diff --git a/libarchive/archive_write_set_format_warc.c b/libarchive/archive_write_set_format_warc.c new file mode 100644 index 000000000..6c7b8c7ee --- /dev/null +++ b/libarchive/archive_write_set_format_warc.c @@ -0,0 +1,406 @@ +/*- + * Copyright (c) 2014 Sebastian Freundt + * Author: Sebastian Freundt + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "archive_platform.h" +__FBSDID("$FreeBSD$"); + +#ifdef HAVE_ERRNO_H +#include +#endif +#include +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_TIME_H +#include +#endif + +#include "archive.h" +#include "archive_entry.h" +#include "archive_entry_locale.h" +#include "archive_private.h" +#include "archive_write_private.h" + +struct warc_s { + unsigned int omit_warcinfo:1; + + time_t now; + mode_t typ; + unsigned int rng; +}; + +static const char warcinfo[] = "\ +software: " PACKAGE_NAME "/" PACKAGE_VERSION "\r\n\ +format: WARC file version 1.0\r\n"; + +typedef enum { + WT_NONE, + /* warcinfo */ + WT_INFO, + /* metadata */ + WT_META, + /* resource */ + WT_RSRC, + /* request, unsupported */ + WT_REQ, + /* response, unsupported */ + WT_RSP, + /* revisit, unsupported */ + WT_RVIS, + /* conversion, unsupported */ + WT_CONV, + /* continutation, unsupported at the moment */ + WT_CONT, + /* invalid type */ + LAST_WT +} warc_type_t; + +typedef struct { + warc_type_t type; + const char *tgturi; + const char *recid; + time_t rectim; + const char *cnttyp; + size_t cntlen; +} warc_essential_hdr_t; + +typedef struct { + unsigned int u[4U]; +} warc_uuid_t; + +static int _warc_options(struct archive_write*, const char *key, const char *v); +static int _warc_header(struct archive_write *a, struct archive_entry *entry); +static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz); +static int _warc_finish_entry(struct archive_write *a); +static int _warc_close(struct archive_write *a); +static int _warc_free(struct archive_write *a); + +/* private routines */ +static ssize_t _popul_ehdr(char *t, size_t z, warc_essential_hdr_t); +static int _gen_uuid(warc_uuid_t tgt[static 1U]); + + +/* + * Set output format to ISO 28500 (aka WARC) format. + */ +int +archive_write_set_format_warc(struct archive *_a) +{ + struct archive_write *a = (struct archive_write *)_a; + struct warc_s *w; + + archive_check_magic(_a, ARCHIVE_WRITE_MAGIC, + ARCHIVE_STATE_NEW, "archive_write_set_format_warc"); + + /* If another format was already registered, unregister it. */ + if (a->format_free != NULL) { + (a->format_free)(a); + } + + w = malloc(sizeof(*w)); + if (w == NULL) { + archive_set_error(&a->archive, ENOMEM, + "Can't allocate warc data"); + return (ARCHIVE_FATAL); + } + /* by default we're emitting a file wide header */ + w->omit_warcinfo = 0U; + /* obtain current time for date fields */ + w->now = time(NULL); + /* reset file type info */ + w->typ = 0; + /* also initialise our rng */ + w->rng = (unsigned int)w->now; + srand(w->rng); + + a->format_data = w; + a->format_name = "WARC/1.0"; + a->format_options = _warc_options; + a->format_write_header = _warc_header; + a->format_write_data = _warc_data; + a->format_close = _warc_close; + a->format_free = _warc_free; + a->format_finish_entry = _warc_finish_entry; + a->archive.archive_format = ARCHIVE_FORMAT_WARC; + a->archive.archive_format_name = "WARC/1.0"; + return (ARCHIVE_OK); +} + + +/* archive methods */ +static int +_warc_options(struct archive_write *a, const char *key, const char *val) +{ + struct warc_s *w = a->format_data; + + if (strcmp(key, "omit-warcinfo") == 0) { + if (val == NULL || strcmp(val, "true") == 0) { + /* great */ + w->omit_warcinfo = 1U; + return (ARCHIVE_OK); + } + } + + /* Note: The "warn" return is just to inform the options + * supervisor that we didn't handle it. It will generate + * a suitable error if no one used this option. */ + return (ARCHIVE_WARN); +} + +static int +_warc_header(struct archive_write *a, struct archive_entry *entry) +{ + struct warc_s *w = a->format_data; + char hdr[512U]; + + /* check whether warcinfo record needs outputting */ + if (!w->omit_warcinfo) { + warc_essential_hdr_t wi = { + WT_INFO, + /*uri*/NULL, + /*urn*/NULL, + /*tim*/w->now, + /*cty*/"application/warc-fields", + /*len*/sizeof(warcinfo) - 1U, + }; + ssize_t r; + + r = _popul_ehdr(hdr, sizeof(hdr), wi); + if (r >= 0) { + /* jackpot! */ + /* now also use HDR buffer for the actual warcinfo */ + memcpy(hdr + r, warcinfo, sizeof(warcinfo)); + r += sizeof(warcinfo) - 1U; + + /* append end-of-record indicator */ + hdr[r++] = '\r'; + hdr[r++] = '\n'; + hdr[r++] = '\r'; + hdr[r++] = '\n'; + + /* write to output stream */ + __archive_write_output(a, hdr, r); + } + /* indicate we're done with file header writing */ + w->omit_warcinfo = 1U; + } + + w->typ = archive_entry_filetype(entry); + if (w->typ == AE_IFREG) { + warc_essential_hdr_t rh = { + WT_RSRC, + /*uri*/archive_entry_pathname(entry), + /*urn*/NULL, + /*tim*/w->now, + /*cty*/NULL, + /*len*/archive_entry_size(entry), + }; + ssize_t r; + + r = _popul_ehdr(hdr, sizeof(hdr), rh); + if (r < 0) { + /* don't bother */ + archive_set_error( + &a->archive, + ARCHIVE_ERRNO_FILE_FORMAT, + "cannot archive file"); + return (ARCHIVE_WARN); + } + /* otherwise append to output stream */ + __archive_write_output(a, hdr, r); + } + /* just pretend it's all good */ + return (ARCHIVE_OK); +} + +static ssize_t +_warc_data(struct archive_write *a, const void *buf, size_t len) +{ + struct warc_s *w = a->format_data; + + if (w->typ == AE_IFREG) { + int rc = __archive_write_output(a, buf, len); + + if (rc != ARCHIVE_OK) { + return rc; + } + } + return len; +} + +static int +_warc_finish_entry(struct archive_write *a) +{ + static const char _eor[] = "\r\n\r\n"; + struct warc_s *w = a->format_data; + + if (w->typ == AE_IFREG) { + int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U); + + if (rc != ARCHIVE_OK) { + return rc; + } + } + /* reset type info */ + w->typ = 0; + return (ARCHIVE_OK); +} + +static int +_warc_close(struct archive_write *a) +{ + (void)a; /* UNUSED */ + return (ARCHIVE_OK); +} + +static int +_warc_free(struct archive_write *a) +{ + struct warc_s *w = a->format_data; + + free(w); + a->format_data = NULL; + return (ARCHIVE_OK); +} + + +/* private routines */ +#define XNPRINTF(x, z, args...) \ + do { \ + int __r = snprintf(x, z, args); \ + if (__r < 0) { \ + return -1; \ + } \ + x += __r; \ + } while (0) + +static ssize_t +_popul_ehdr(char *tgt, size_t tsz, warc_essential_hdr_t hdr) +{ + static const char _ver[] = "WARC/1.0\r\n"; + static const char *_typ[LAST_WT] = { + NULL, "warcinfo", "metadata", "resource", NULL + }; + char std_uuid[48U]; + char *tp = tgt; + const char *const ep = tgt + tsz; + + if (hdr.type == WT_NONE || hdr.type > WT_RSRC) { + /* brilliant, how exactly did we get here? */ + return -1; + } + + memcpy(tp, _ver, sizeof(_ver) - 1U); + tp += sizeof(_ver) - 1U; + + XNPRINTF(tp, ep - tp, "WARC-Type: %s\r\n", _typ[hdr.type]); + + if (hdr.tgturi != NULL) { + /* check if there's a xyz:// */ + static const char _uri[] = ""; + static const char _fil[] = "file://"; + const char *u; + char *chk = strchr(hdr.tgturi, ':'); + + if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') { + /* yep, it's definitely a URI */ + u = _uri; + } else { + /* hm, best to prepend file:// then */ + u = _fil; + } + XNPRINTF( + tp, ep - tp, + "WARC-Target-URI: %s%s\r\n", u, hdr.tgturi); + } + + /* we could essentially put one of mtime/ctime/atime here */ + { + struct tm *rt; + + rt = gmtime(&hdr.rectim); + if (rt == NULL) { + return -1; + } + /* next one can't fail, + * at least it won't set TP out of its bounds */ + tp += strftime(tp, ep - tp, "WARC-Date: %FT%H:%M:%SZ\r\n", rt); + } + if (hdr.recid == NULL) { + /* generate one, grrrr */ + warc_uuid_t u; + + _gen_uuid(&u); + snprintf( + std_uuid, sizeof(std_uuid), + "", + u.u[0U], + u.u[1U] >> 16U, u.u[1U] & 0xffffU, + u.u[2U] >> 16U, u.u[2U] & 0xffffU, + u.u[3U]); + hdr.recid = std_uuid; + } + + /* record-id is mandatory, fingers crossed we won't fail */ + XNPRINTF(tp, ep - tp, "WARC-Record-ID: %s\r\n", hdr.recid); + + if (hdr.cnttyp != NULL) { + XNPRINTF(tp, ep - tp, "Content-Type: %s\r\n", hdr.cnttyp); + } + + /* next one is mandatory */ + XNPRINTF(tp, ep - tp, "Content-Length: %zu\r\n", hdr.cntlen); + + if (tp + 2U >= ep) { + /* doesn't fit */ + return -1; + } + + *tp++ = '\r'; + *tp++ = '\n'; + return tp - tgt; +} + +static int +_gen_uuid(warc_uuid_t tgt[static 1U]) +{ + tgt->u[0U] = (unsigned int)rand(); + tgt->u[1U] = (unsigned int)rand(); + tgt->u[2U] = (unsigned int)rand(); + tgt->u[3U] = (unsigned int)rand(); + /* obey uuid version 4 rules */ + tgt->u[1U] &= 0xffff0fffU; + tgt->u[1U] |= 0x4000U; + tgt->u[2U] &= 0x3fffffffU; + tgt->u[2U] |= 0x80000000U; + return 0; +} + +/* archive_write_set_format_warc.c ends here */ -- 2.47.2