--- /dev/null
+[formatters]
+keys: detailed,simple,unadorned,state
+
+[handlers]
+keys: simple_console,detailed_console,unadorned_console,simple_console_warnings_only
+
+[loggers]
+keys: root,build,state,naoki
+
+[formatter_state]
+format: %(asctime)s - %(message)s
+
+[formatter_unadorned]
+format: %(message)s
+
+[formatter_simple]
+format: %(levelname)s: %(message)s
+
+; useful for debugging:
+[formatter_detailed]
+format: %(levelname)s %(filename)s:%(lineno)d: %(message)s
+
+[handler_unadorned_console]
+class: StreamHandler
+args: []
+formatter: unadorned
+level: INFO
+
+[handler_simple_console]
+class: StreamHandler
+args: []
+formatter: simple
+level: INFO
+
+[handler_simple_console_warnings_only]
+class: StreamHandler
+args: []
+formatter: simple
+level: WARNING
+
+[handler_detailed_console]
+class: StreamHandler
+args: []
+formatter: detailed
+level: WARNING
+
+; usually we don't want to set a level for loggers
+; this way all handlers get all messages, and messages can be filtered
+; at the handler level
+;
+; all these loggers default to a console output handler
+;
+[logger_root]
+level: NOTSET
+handlers: simple_console
+
+; the naoki logger normally has no output
+; it catches things like naoki.trace_decorator and naoki.util
+; we don't normally want it to propagate to the root logger, either
+[logger_naoki]
+level: NOTSET
+handlers:
+qualname: naoki
+propagate: 0
+
+[logger_state]
+level: NOTSET
+; unadorned_console only outputs INFO or above
+handlers: unadorned_console
+qualname: naoki.Root.state
+propagate: 0
+
+[logger_build]
+level: NOTSET
+handlers: simple_console_warnings_only
+qualname: naoki.Root.build
+propagate: 0
+
+; the following is a list naoki logger qualnames used within the code:
+;
+; qualname: naoki.util
+; qualname: naoki.uid
+; qualname: naoki.trace_decorator
--- /dev/null
+[distro]
+
+; Name of the distribution we build.
+name = IPFire
+sname = ipfire
+
+; Major version number
+epoch = 3
+
+; Minor version string
+version = %(epoch)s.0-prealpha2
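+; (with epoch = 3 above this should interpolate to 3.0-prealpha2 when the config is read)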
+
+; A descriptive slogan
+slogan = "Gluttony"
-#!/bin/bash
-###############################################################################
-# #
-# IPFire.org - A linux based firewall #
-# Copyright (C) 2007, 2008, 2009 Michael Tremer & Christian Schmidt #
-# #
-# This program is free software: you can redistribute it and/or modify #
-# it under the terms of the GNU General Public License as published by #
-# the Free Software Foundation, either version 3 of the License, or #
-# (at your option) any later version. #
-# #
-# This program is distributed in the hope that it will be useful, #
-# but WITHOUT ANY WARRANTY; without even the implied warranty of #
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
-# GNU General Public License for more details. #
-# #
-# You should have received a copy of the GNU General Public License #
-# along with this program. If not, see <http://www.gnu.org/licenses/>. #
-# #
-###############################################################################
-#
-
-BASEDIR=/ipfire-3.x
-
-. ${BASEDIR}/tools/common-include
-
-while [ $# -gt 0 ]; do
- case "${1}" in
- --debug|-d)
- DEBUG=1
- log DEBUG "Debugging mode enabled by command line."
- ;;
- --toolchain)
- TOOLCHAIN=1
- log DEBUG "Toolchain mode enabled by command line."
- ;;
- *)
- action=${1}
- shift
- break
- ;;
- esac
- shift
-done
-
-export DEBUG TOOLCHAIN
-
-function package() {
- local action=${1}
- shift
-
- case "${action}" in
- dependencies|deps)
- echo -e "${BOLD}Build dependencies:${NORMAL} $(package_build_dependencies $@ | tr '\n' ' ')"
- echo -e "${BOLD}Dependencies:${NORMAL} $(package_runtime_dependencies $@ | tr '\n' ' ')"
- ;;
- find)
- find_package $@
- ;;
- is_built)
- if package_is_built $(find_package $@); then
- echo "Package is built."
- return 0
- else
- echo "Package is NOT built."
- return 1
- fi
- ;;
- list)
- package_list
- ;;
- packages)
- package_packages $(find_package $@)
- ;;
- profile|info)
- package_profile $(find_package $@)
- ;;
- _info)
- package_info $(find_package $@)
- ;;
- esac
-}
-
-case "${action}" in
- all)
- for pkg in $(${NAOKI} tree); do
- echo "${pkg}:"
- package is_built ${pkg} && continue
- ${NAOKI} build ${pkg} || break
- done
- ;;
- build)
- ${NAOKI} build $@
- ;;
- package|pkg)
- package $@
- ;;
- toolchain)
- TOOLCHAIN=1
- ${NAOKI} --toolchain tree
- ;;
- toolchain_build)
- for i in $($0 toolchain); do
- ${NAOKI} --toolchain toolchain ${i}
- done
- ;;
- tree)
- ${NAOKI} tree
- ;;
- random)
- pkgs=$(package_list)
- while true; do
- if [ -z "${pkgs}" ]; then
- break
- fi
-
- pkgs=$(package_random ${pkgs})
- pkg=$(awk '{print $NF }' <<<${pkgs})
-
- ${NAOKI} build ${pkg}
-
- pkgs=$(listremove ${pkg} ${pkgs})
- done
- ;;
-esac
+#!/usr/bin/python
+
+import sys
+from optparse import OptionParser
+
+import naoki
+
+op = OptionParser()
+
+# verbosity
+op.add_option("-v", "--verbose", action="store_const", const=2,
+ dest="verbose", default=1, help="verbose build")
+op.add_option("-q", "--quiet", action="store_const", const=0,
+ dest="verbose", help="quiet build")
+
+# modes (basic commands)
+op.add_option("--download", action="store_const", const="download",
+ dest="mode", help="download files")
+op.add_option("--build", "--rebuild", action="store_const",
+ const="rebuild", dest="mode", default='rebuild',
+ help="rebuild the specified packages")
+op.add_option("--info", action="store_const", const="info", dest="mode",
+ help="return some info about the specified packages")
+op.add_option("--list-packages", action="store_const", const="list-packages",
+ dest="mode", help="list all packages")
+op.add_option("--list-groups", action="store_const", const="list-groups",
+ dest="mode", help="list all groups")
+op.add_option("--list-tree", action="store_const", const="list-tree",
+ dest="mode", help="list the dependency tree")
+
+n = naoki.Naoki(op)
+exitStatus = 0
+
+try:
+ n.action()
+
+except (SystemExit,):
+ raise
+
+except (KeyboardInterrupt,):
+ exitStatus = 7
+ n.log.error("Exiting on user interrupt, <CTRL>-C")
+
+sys.exit(exitStatus)
--- /dev/null
+#!/usr/bin/python
+
+import ConfigParser
+import curses
+import logging
+import logging.config
+import logging.handlers
+import os.path
+import sys
+import time
+
+import chroot
+import logger
+import package
+import util
+
+from constants import *
+
+class Naoki(object):
+	def __init__(self, op):
+		self.packages = []
+		(self.options, self.args) = op.parse_args()
+
+ # set up basic logging until config file can be read
+ logging.basicConfig(format="%(levelname)s: %(message)s",
+ level=logging.WARNING)
+ self.log = logging.getLogger()
+
+ self.config = config
+ self.setup_logging()
+
+ self.log.info("Started naoki on %s" % time.strftime("%a, %d %b %Y %H:%M:%S"))
+
+ # dump configuration to log
+ self.log.debug("Configuration:")
+ for k, v in self.config.items():
+ self.log.debug(" %s: %s" % (k, v))
+
+ def setup_logging(self):
+ log_ini = self.config["log_config_file"]
+ if os.path.exists(log_ini):
+ logging.config.fileConfig(log_ini)
+
+ if sys.stderr.isatty():
+ curses.setupterm()
+ self.log.handlers[0].setFormatter(logger._ColorLogFormatter())
+
+ if self.options.verbose == 0:
+ self.log.handlers[0].setLevel(logging.WARNING)
+ elif self.options.verbose == 1:
+ self.log.handlers[0].setLevel(logging.INFO)
+ elif self.options.verbose == 2:
+ self.log.handlers[0].setLevel(logging.DEBUG)
+ logging.getLogger("naoki").propagate = 1
+
+ fh = logging.handlers.RotatingFileHandler(self.config["log_file"],
+ maxBytes=1073741824, backupCount=6)
+ fh.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+ fh.setLevel(logging.NOTSET)
+ self.log.addHandler(fh)
+
+ def addPackage(self, package):
+ self.log.debug("Add package: %s" % repr(package))
+ self.packages.append(package)
+
+ def action(self, action=None):
+ if not action:
+ action = self.options.mode
+
+ # Parse all package names
+ for pkg_name in self.args:
+ pkg = package.find(pkg_name)
+ if not pkg:
+ self.log.warn("Not a package: %s" % pkg_name)
+ continue
+ self.addPackage(pkg)
+
+ if action == "download":
+ if not self.packages:
+ self.packages = package.list()
+ for pkg in self.packages:
+ pkg.download()
+
+ elif action == "info":
+ for pkg in self.packages:
+ print pkg.info
+
+ elif action == "list-packages":
+ for pkg in package.list():
+ print "%s" % pkg
+
+ elif action == "list-groups":
+			print "\n".join(package.groups())
+
+ elif action == "rebuild":
+ self.build()
+
+ def build(self):
+ requeue = []
+ while True:
+ if not self.packages:
+ return
+
+ # Get first package that is to be done
+ build = Build(self.packages.pop(0))
+
+ if build.package.isBuilt:
+ self.log.warn("Package is already built. Will overwrite.")
+
+ self.log.info("Building %s..." % build.package.name)
+ build.build()
+
+
+class Build(object):
+ def __init__(self, package):
+ self.package = package
+
+ self.environment = chroot.Environment(self.package)
+
+ def init(self):
+ self.environment.init()
+
+ self.extractAll()
+
+ def extractAll(self):
+ packages = self.package.deps + self.package.build_deps
+ for pkg in config["mandatory_packages"]:
+ pkg = package.find(pkg)
+ if not pkg in packages:
+ packages.append(pkg)
+
+ packages = package.depsolve(packages, recursive=True)
+
+ for pkg in packages:
+ pkg.extract(self.environment.chrootPath())
+
+ def build(self):
+ self.package.download()
+ self.init()
+ self.make("package")
+
+ def make(self, target):
+ file = self.package.filename.replace(BASEDIR, "/usr/src")
+ cmd = "make --no-print-directory -C %s -f %s %s" % (os.path.dirname(file),
+ file, target,)
+ self.environment.doChroot(cmd)
--- /dev/null
+#!/usr/bin/python
+
+import grp
+import logging
+import os
+import random
+import stat
+
+import util
+from constants import *
+from logger import getLog
+
+class Environment(object):
+ def __init__(self, package):
+ self.package = package
+ self.config = config
+
+ # mount/umount
+ self.umountCmds = [
+ "umount -n %s" % self.chrootPath("proc"),
+ "umount -n %s" % self.chrootPath("sys"),
+ "umount -n %s" % self.chrootPath("usr", "src", "cache"),
+ "umount -n %s" % self.chrootPath("usr", "src", "packages"),
+ "umount -n %s" % self.chrootPath("usr", "src", "pkgs"),
+ "umount -n %s" % self.chrootPath("usr", "src", "src"),
+ "umount -n %s" % self.chrootPath("usr", "src", "tools"),
+ ]
+ self.mountCmds = [
+ "mount -n -t proc naoki_chroot_proc %s" % self.chrootPath("proc"),
+ "mount -n -t sysfs naoki_chroot_sysfs %s" % self.chrootPath("sys"),
+ "mount -n --bind -o ro %s %s" % (os.path.join(CACHEDIR), self.chrootPath("usr", "src", "cache")),
+ "mount -n --bind -o ro %s %s" % (os.path.join(PACKAGESDIR), self.chrootPath("usr", "src", "packages")),
+ "mount -n --bind -o ro %s %s" % (os.path.join(PKGSDIR), self.chrootPath("usr", "src", "pkgs")),
+ "mount -n --bind -o ro %s %s" % (os.path.join(BASEDIR, "src"), self.chrootPath("usr", "src", "src")),
+ "mount -n --bind -o ro %s %s" % (os.path.join(TOOLSDIR), self.chrootPath("usr", "src", "tools")),
+ ]
+
+ self.buildroot = "buildroot.%d" % random.randint(0, 1024)
+ self.log = None
+ self.__initialized = False
+
+ def init(self):
+ if self.__initialized:
+ return
+ try:
+ self._init()
+ except (KeyboardInterrupt, Exception):
+ #self._callHooks('initfailed')
+ raise
+ self.__initialized = True
+
+ def _init(self):
+ self._setupLogging()
+
+ # create dirs
+ self.log.debug("Creating directories...")
+ dirs = (
+ CACHEDIR,
+ PACKAGESDIR,
+ self.chrootPath(self.buildroot),
+ self.chrootPath("bin"),
+ self.chrootPath("etc"),
+ self.chrootPath("proc"),
+ self.chrootPath("root"),
+ self.chrootPath("sbin"),
+ self.chrootPath("sys"),
+ self.chrootPath("tmp"),
+ self.chrootPath("usr/src/cache"),
+ self.chrootPath("usr/src/packages"),
+ self.chrootPath("usr/src/pkgs"),
+ self.chrootPath("usr/src/src"),
+ self.chrootPath("usr/src/tools"),
+ self.chrootPath("var/tmp"),
+ )
+ for item in dirs:
+ util.mkdir(item)
+
+ # touch files
+ self.log.debug("Touching files...")
+ files = (
+ "etc/fstab",
+ "etc/mtab",
+ )
+ for item in files:
+ util.touch(self.chrootPath(item))
+
+ self._setupDev()
+ self._setupUsers()
+ self._setupToolchain()
+
+ def clean(self):
+ pass
+
+ def make(self, target):
+ file = self.package.filename.replace(BASEDIR, "/usr/src")
+ try:
+ self._mountall()
+ self.doChroot("make --no-print-directory -C %s -f %s %s" % \
+ (os.path.dirname(file), file, target), shell=True)
+ finally:
+ self._umountall()
+
+ def doChroot(self, command, shell=True, *args, **kwargs):
+ ret = None
+ try:
+ # XXX Should be globally defined
+ env = config.environment.copy()
+ env.update({
+ "HOME" : "/root",
+ "PATH" : "/sbin:/bin:/usr/sbin:/usr/bin:/tools_i686/sbin:/tools_i686/bin",
+ "TERM" : os.environ["TERM"],
+ "PS1" : os.environ.get("PS1", "\u:\w\$ "),
+ "BASEDIR" : "/usr/src",
+ "BUILDROOT" : "/%s" % self.buildroot,
+ "PKGROOT" : "/usr/src/pkgs",
+ "CHROOT" : "1",
+ })
+
+ if kwargs.has_key("env"):
+ env.update(kwargs.pop("env"))
+
+ self._mountall()
+
+ ret = util.do(command, chrootPath=self.chrootPath(),
+ shell=shell, env=env, logger=self.log, *args, **kwargs)
+
+ finally:
+ self._umountall()
+
+ return ret
+
+ def chrootPath(self, *args):
+ return os.path.join(BUILDDIR, "environments", self.package.id, *args)
+
+ def _setupLogging(self):
+ logfile = os.path.join(LOGDIR, self.package.id, "build.log")
+ if not os.path.exists(os.path.dirname(logfile)):
+ util.mkdir(os.path.dirname(logfile))
+ self.log = logging.getLogger(self.package.id)
+ fh = logging.FileHandler(logfile)
+ fh.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
+ fh.setLevel(logging.NOTSET)
+ self.log.addHandler(fh)
+
+ def _setupDev(self):
+ # files in /dev
+ util.rm(self.chrootPath("dev"))
+ util.mkdir(self.chrootPath("dev", "pts"))
+ util.mkdir(self.chrootPath("dev", "shm"))
+ prevMask = os.umask(0000)
+ for i in (
+ (stat.S_IFCHR | 0666, os.makedev(1, 3), "dev/null"),
+ (stat.S_IFCHR | 0666, os.makedev(1, 7), "dev/full"),
+ (stat.S_IFCHR | 0666, os.makedev(1, 5), "dev/zero"),
+ (stat.S_IFCHR | 0666, os.makedev(1, 8), "dev/random"),
+ (stat.S_IFCHR | 0444, os.makedev(1, 9), "dev/urandom"),
+ (stat.S_IFCHR | 0666, os.makedev(5, 0), "dev/tty"),
+ (stat.S_IFCHR | 0600, os.makedev(5, 1), "dev/console")
+ ):
+ # create node
+ os.mknod(self.chrootPath(i[2]), i[0], i[1])
+
+ os.symlink("/proc/self/fd/0", self.chrootPath("dev", "stdin"))
+ os.symlink("/proc/self/fd/1", self.chrootPath("dev", "stdout"))
+ os.symlink("/proc/self/fd/2", self.chrootPath("dev", "stderr"))
+ os.symlink("/dev/pts/ptmx", self.chrootPath("dev", "ptmx"))
+ os.umask(prevMask)
+
+ # mount/umount
+ for devUnmtCmd in (
+ "umount -n %s" % self.chrootPath("dev", "pts"),
+ "umount -n %s" % self.chrootPath("dev", "shm")):
+ if devUnmtCmd not in self.umountCmds:
+ self.umountCmds.append(devUnmtCmd)
+
+ mountopt = "gid=%d,mode=0620,ptmxmode=0666" % grp.getgrnam("tty").gr_gid
+ if os.uname()[2] >= "2.6.29":
+ mountopt += ",newinstance"
+
+ for devMntCmd in (
+ "mount -n -t devpts -o %s naoki_chroot_devpts %s" % (mountopt, self.chrootPath("dev", "pts")),
+ "mount -n -t tmpfs naoki_chroot_shmfs %s" % self.chrootPath("dev", "shm")):
+ if devMntCmd not in self.mountCmds:
+ self.mountCmds.append(devMntCmd)
+
+ def _setupUsers(self):
+ ## XXX Could be done better
+ self.log.debug("Creating users")
+ f = open("/etc/passwd")
+ for line in f.readlines():
+ if line.startswith("root"):
+ g = open(self.chrootPath("etc", "passwd"), "w")
+ g.write("%s" % line)
+ g.close()
+ break
+ f.close()
+
+ f = open("/etc/group")
+ for line in f.readlines():
+ if line.startswith("root"):
+ g = open(self.chrootPath("etc", "group"), "w")
+ g.write("%s" % line)
+ g.close()
+ break
+ f.close()
+
+ def _setupToolchain(self):
+ if os.path.exists(self.chrootPath("tools_i686")):
+ return
+
+ self.log.debug("Extracting toolchain...")
+
+ util.do("tar xfz %s -C %s" % (TOOLCHAIN_TARBALL, self.chrootPath()),
+ shell=True)
+
+ symlinks = (
+ "bin/bash",
+ "bin/sh",
+ "bin/pwd",
+ )
+ for symlink in symlinks:
+ if os.path.exists(self.chrootPath(symlink)):
+ continue
+ self.log.debug("Creating symlink /%s" % symlink)
+ os.symlink("/tools_i686/%s" % symlink, self.chrootPath(symlink))
+
+ def _mountall(self):
+ """mount 'normal' fs like /dev/ /proc/ /sys"""
+ for cmd in self.mountCmds:
+ util.do(cmd, shell=True)
+
+ def _umountall(self):
+ """umount all mounted chroot fs."""
+ for cmd in self.umountCmds:
+ util.do(cmd, raiseExc=0, shell=True)
--- /dev/null
+#!/usr/bin/python
+
+import ConfigParser
+import os
+
+BASEDIR = os.getcwd()
+
+BUILDDIR = os.path.join(BASEDIR, "build")
+CACHEDIR = os.path.join(BASEDIR, "cache")
+CONFIGDIR = os.path.join(BASEDIR, "config")
+LOGDIR = os.path.join(BASEDIR, "logs")
+PKGSDIR = os.path.join(BASEDIR, "pkgs")
+PACKAGESDIR = os.path.join(BUILDDIR, "packages")
+TOOLSDIR = os.path.join(BASEDIR, "tools")
+
+TARBALLDIR = os.path.join(CACHEDIR, "tarballs")
+PATCHESDIR = os.path.join(CACHEDIR, "patches")
+
+CONFIGFILE = os.path.join(CONFIGDIR, "naoki.conf")
+
+TOOLCHAIN_TARBALL = os.path.join(BUILDDIR, "tools_i686.tar.gz")
+
+class Config(object):
+ _items = {
+ "mandatory_packages" : [
+ "core/bash",
+ "core/gcc",
+ "core/glibc",
+ "core/make",
+ ],
+ #
+ # Cleanup settings
+ "cleanup_after_fail" : True,
+ #
+ # Distro items
+ "distro_name" : "unknown",
+ "distro_sname" : "unknown",
+ "distro_epoch" : "unknown",
+ "distro_version" : "unknown",
+ "distro_slogan" : "unknown",
+ #
+ # Downloads
+ "download_object_url" : "http://source.ipfire.org/source-3.x/%(file)s",
+ "download_patch_url" : "http://source.ipfire.org/source-3.x/%(file)s",
+ #
+ # Logging
+ "log_config_file" : os.path.join(CONFIGDIR, "logging.ini"),
+ "log_file" : os.path.join(LOGDIR, "naoki.log"),
+ }
+
+ def __init__(self):
+ self.read([CONFIGFILE, os.path.join(BASEDIR, ".config")])
+
+ def read(self, files):
+ parser = ConfigParser.ConfigParser()
+ parser.read(files)
+
+ config = {}
+ for key, val in parser.items(ConfigParser.DEFAULTSECT):
+ config[key] = val
+
+ for section in parser.sections():
+ for key, val in parser.items(section):
+ config["%s_%s" % (section, key)] = val
+
+ self._items.update(config)
+
+ def items(self):
+ return self._items.items()
+
+ def __getitem__(self, item):
+ return self._items[item]
+
+ def __setitem__(self, item, value):
+ self._items[item] = value
+
+ @property
+ def environment(self):
+ return {
+ "DISTRO_NAME" : self["distro_name"],
+ "DISTRO_SNAME" : self["distro_sname"],
+ "DISTRO_EPOCH" : self["distro_epoch"],
+ "DISTRO_VERSION" : self["distro_version"],
+ "DISTRO_SLOGAN" : self["distro_slogan"],
+ }
+
+# Create a globally useable instance of the configuration
+config = Config()
--- /dev/null
+#!/usr/bin/python
+
+class Error(Exception):
+ "base class for our errors."
+ def __init__(self, msg, status=None):
+ Exception.__init__(self)
+ self.msg = msg
+ self.resultcode = 1
+ if status is not None:
+ self.resultcode = status
+
+ def __str__(self):
+ return self.msg
+
+
+class BuildRootLocked(Error):
+ "build root in use by another process."
+ def __init__(self, msg):
+ Error.__init__(self, msg)
+ self.msg = msg
+ self.resultcode = 60
+
+
+class commandTimeoutExpired(Error):
+ def __init__(self, msg):
+ Error.__init__(self, msg)
+ self.msg = msg
+ self.resultcode = 10
+
--- /dev/null
+#!/usr/bin/python
+
+import curses
+import logging
+import sys
+import time
+
+# Defaults to the calling module's logger and binds to it late: every attribute
+# access is forwarded to the real logging.Logger at call time. This works around
+# the problem that reconfiguring the logging module would otherwise leave loggers
+# configured before the reconfiguration without any output.
+class getLog(object):
+ def __init__(self, name=None, prefix="", *args, **kargs):
+ if name is None:
+ frame = sys._getframe(1)
+ name = frame.f_globals["__name__"]
+
+ self.name = prefix + name
+
+ def __getattr__(self, name):
+ logger = logging.getLogger(self.name)
+ return getattr(logger, name)
+
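+# A minimal usage sketch (hypothetical module code, not part of naoki itself):
+# a module can grab its logger at import time and still pick up a later
+# logging.config.fileConfig() call, because the real Logger is looked up on
+# every attribute access:
+#
+#   log = getLog()          # binds to the importing module's __name__
+#   log.debug("loaded")     # resolved against the current logging configuration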
+
+# Borrowed from tornado
+class _ColorLogFormatter(logging.Formatter):
+ def __init__(self, *args, **kwargs):
+ logging.Formatter.__init__(self, *args, **kwargs)
+ fg_color = curses.tigetstr("setaf") or curses.tigetstr("setf") or ""
+ self._colors = {
+ logging.DEBUG: curses.tparm(fg_color, 4), # Blue
+ logging.INFO: curses.tparm(fg_color, 2), # Green
+ logging.WARNING: curses.tparm(fg_color, 3), # Yellow
+ logging.ERROR: curses.tparm(fg_color, 1), # Red
+ }
+ self._normal = curses.tigetstr("sgr0")
+
+ def format(self, record):
+ try:
+ record.message = record.getMessage()
+ except Exception, e:
+ record.message = "Bad message (%r): %r" % (e, record.__dict__)
+ record.asctime = time.strftime(
+ "%H:%M:%S", self.converter(record.created))
+ prefix = '[%(levelname)7s | %(asctime)s]' % \
+ record.__dict__
+ color = self._colors.get(record.levelno, self._normal)
+ formatted = color + prefix + self._normal + " " + record.message
+ if record.exc_info:
+ if not record.exc_text:
+ record.exc_text = self.formatException(record.exc_info)
+ if record.exc_text:
+ formatted = formatted.rstrip() + "\n" + record.exc_text
+ return formatted.replace("\n", "\n ")
--- /dev/null
+#!/usr/bin/python
+
+import os
+import sys
+import urlgrabber
+import urlgrabber.progress
+
+import chroot
+import util
+
+from constants import *
+from logger import getLog
+
+def list():
+ pkgs = []
+ for dir in os.listdir(PKGSDIR):
+ if not os.path.isdir(os.path.join(PKGSDIR, dir)):
+ continue
+ if dir == "toolchain":
+ continue #XXX
+
+ for package in os.listdir(os.path.join(PKGSDIR, dir)):
+ package = os.path.join(dir, package)
+ pkgs.append(Package(package))
+
+ pkgs.sort()
+ return pkgs
+
+def find(s):
+ p = Package(s)
+ if p in list():
+ return p
+
+ for package in list():
+ if package.name.endswith("%s" % s):
+ return package
+
+def groups():
+ groups = []
+ for package in list():
+ group = package.group
+ if not group in groups:
+ groups.append(group)
+ groups.sort()
+ return groups
+
+def download(file, type):
+ if not file:
+ return
+
+ dirs = {
+ "object" : TARBALLDIR,
+ "patch" : PATCHESDIR,
+ }
+ filepath = os.path.join(dirs[type], file)
+ if os.path.exists(filepath):
+ return
+
+ g = urlgrabber.grabber.URLGrabber(
+ user_agent = "%sSourceGrabber/%s" % (config["distro_name"], config["distro_version"],),
+ progress_obj = urlgrabber.progress.TextMeter(),
+ text = "Downloading %s..." % file,
+ )
+ gobj = g.urlopen(config["download_%s_url" % type] % { "file" : file })
+
+ # XXX Need to check SHA1 sum here
+
+ for dir in dirs.values():
+ util.mkdir(dir)
+
+ fobj = open(filepath, "w")
+ fobj.write(gobj.read())
+ fobj.close()
+ gobj.close()
+
+def depsolve(packages, recursive=False):
+ deps = []
+	for package in packages:
+		if not package:
+			continue
+		deps.append(package)
+
+ if not recursive or not deps:
+ return deps
+
+ while True:
+ length = len(deps)
+ for dep in deps[:]:
+ deps.extend(dep.deps)
+
+ new_deps = []
+ for dep in deps:
+ if not dep in new_deps:
+ new_deps.append(dep)
+
+ deps = new_deps
+
+ if length == len(deps):
+ break
+
+ deps.sort()
+ return deps
+
+class Package(object):
+ info_str = """\
+Name : %(name)s
+Version : %(version)s
+Release : %(release)s
+Group : %(group)s
+
+%(summary)s
+
+Description :
+%(description)s
+
+Maintainer : %(maintainer)s
+License : %(license)s
+
+Built? : %(isBuilt)s
+Can build? : %(canBuild)s
+
+Files :
+%(objects)s
+
+Patches :
+%(patches)s
+"""
+
+ def __init__(self, name):
+ self._name = name
+
+ self.config = config
+ self.__fetch_data = None
+
+ def __str__(self):
+ return "%-20s %14s | %-24s | %s" % (self.name, "%s-%s" % \
+ (self.version, self.release), self.group, self.summary)
+
+ def __repr__(self):
+ return "<Package %s>" % self.name
+
+ def __cmp__(self, other):
+ return cmp(self.name, other.name)
+
+ def fetch(self, key=None):
+ return self.__fetch()[key]
+
+ def __fetch(self):
+ if not self.__fetch_data:
+ env = os.environ.copy()
+ env.update(config.environment)
+ env["PKGROOT"] = PKGSDIR
+ output = util.do("make -f %s" % self.filename, shell=True,
+ cwd=os.path.join(PKGSDIR, self.name), returnOutput=1, env=env)
+
+ ret = {}
+ for line in output.splitlines():
+ a = line.split("=", 1)
+ if not len(a) == 2: continue
+ key, val = a
+ ret[key] = val.strip("\"")
+
+ ret["FINGERPRINT"] = self.fingerprint
+ self.__fetch_data = ret
+
+ return self.__fetch_data
+
+ def download(self):
+ for object in self.objects:
+ download(object, type="object")
+ for patch in self.patches:
+ download(patch, type="patch")
+
+ @property
+ def fingerprint(self):
+ return str(os.stat(self.filename).st_mtime)
+
+ @property
+ def filename(self):
+ return os.path.join(PKGSDIR, self.name, os.path.basename(self.name)) + ".nm"
+
+ @property
+ def name(self):
+ return self._name
+
+ @property
+ def version(self):
+ return self.fetch("PKG_VER")
+
+ @property
+ def release(self):
+ return self.fetch("PKG_REL")
+
+ @property
+ def summary(self):
+ return self.fetch("PKG_SUMMARY")
+
+ @property
+ def description(self):
+ return self.fetch("PKG_DESCRIPTION")
+
+ @property
+ def group(self):
+ return self.fetch("PKG_GROUP")
+
+ @property
+ def packages(self):
+ return self.fetch("PKG_PACKAGES")
+
+ @property
+ def package_files(self):
+ return sorted(self.fetch("PKG_PACKAGES_FILES").split(" "))
+
+ @property
+ def objects(self):
+ objects = []
+ for object in sorted(self.fetch("PKG_OBJECTS").split(" ")):
+ if not object in self.patches:
+ objects.append(object)
+ return objects
+
+ @property
+ def patches(self):
+ return sorted(self.fetch("PKG_PATCHES").split(" "))
+
+ @property
+ def maintainer(self):
+ return self.fetch("PKG_MAINTAINER")
+
+ @property
+ def deps(self):
+ return self.getDeps()
+
+ def getDeps(self, recursive=False):
+ deps = []
+ for package in self.fetch("PKG_DEPENDENCIES").split(" "):
+ package = find(package)
+ if package:
+ deps.append(package)
+ return depsolve(deps, recursive)
+
+ @property
+ def build_deps(self):
+ deps = []
+ for package in self.fetch("PKG_BUILD_DEPENDENCIES").split(" "):
+ deps.append(find(package))
+
+ deps.sort()
+ return deps
+
+ @property
+ def url(self):
+ return self.fetch("PKG_URL")
+
+ @property
+ def id(self):
+ return "%s-%s-%s" % (self.name, self.version, self.release)
+
+ @property
+ def license(self):
+ return self.fetch("PKG_LICENSE")
+
+ @property
+ def __info(self):
+ return {
+ "name" : self.name,
+ "version" : self.version,
+ "release" : self.release,
+ "summary" : self.summary,
+ "description" : self.description,
+ "maintainer" : self.maintainer,
+ "objects" : self.objects,
+ "patches" : self.patches,
+ "group" : self.group,
+ "license" : self.license,
+ "isBuilt" : self.isBuilt,
+ "canBuild" : self.canBuild,
+ }
+
+ @property
+ def info(self, wiki=False):
+ if wiki:
+ pass
+ else:
+ info = self.__info
+ info["objects"] = "\n".join(info["objects"])
+ info["patches"] = "\n".join(info["patches"])
+ return self.info_str % info
+
+ @property
+ def isBuilt(self):
+ for item in self.package_files:
+ if not os.path.exists(os.path.join(PACKAGESDIR, item)):
+ return False
+ return True
+
+ @property
+ def canBuild(self):
+ deps = self.deps + self.build_deps
+ for dep in deps:
+ if not dep.isBuilt:
+ return False
+
+ return True
+
+ def extract(self, dest):
+ files = [os.path.join(PACKAGESDIR, file) for file in self.package_files]
+ if not files:
+ return
+
+ getLog().debug("Extracting %s..." % self.name)
+ util.do("%s --root=%s %s" % (os.path.join(TOOLSDIR, "decompressor"),
+ dest, " ".join(files)), shell=True)
--- /dev/null
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
+# Copyright 2009 Red Hat, Inc - pycurl support added by Seth Vidal
+
+# $Id: __init__.py,v 1.20 2006/09/22 00:58:55 mstenner Exp $
+
+"""A high-level cross-protocol url-grabber.
+
+Using urlgrabber, data can be fetched in three basic ways:
+
+ urlgrab(url) copy the file to the local filesystem
+ urlopen(url) open the remote file and return a file object
+ (like urllib2.urlopen)
+ urlread(url) return the contents of the file as a string
+
+When using these functions (or methods), urlgrabber supports the
+following features:
+
+ * identical behavior for http://, ftp://, and file:// urls
+ * http keepalive - faster downloads of many files by using
+ only a single connection
+ * byte ranges - fetch only a portion of the file
+ * reget - for a urlgrab, resume a partial download
+ * progress meters - the ability to report download progress
+ automatically, even when using urlopen!
+ * throttling - restrict bandwidth usage
+ * retries - automatically retry a download if it fails. The
+ number of retries and failure types are configurable.
+ * authenticated server access for http and ftp
+ * proxy support - support for authenticated http and ftp proxies
+ * mirror groups - treat a list of mirrors as a single source,
+ automatically switching mirrors if there is a failure.
+"""
+
+__version__ = '3.9.0'
+__date__ = '2009/07/31'
+__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
+             'Ryan Tomayko <rtomayko@naeblis.cx>, ' \
+ 'Seth Vidal <skvidal@fedoraproject.org>'
+__url__ = 'http://linux.duke.edu/projects/urlgrabber/'
+
+from grabber import urlgrab, urlopen, urlread
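+
+# A short illustrative sketch of the three helpers documented above (the URL
+# and target filename are placeholders, not part of urlgrabber itself):
+#
+#   from urlgrabber import urlgrab, urlopen, urlread
+#   urlgrab("http://example.org/file.tar.gz", "file.tar.gz")  # copy to disk
+#   fo = urlopen("http://example.org/file.tar.gz")            # file-like object
+#   data = urlread("http://example.org/README")               # contents as a string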
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+# $Id: byterange.py,v 1.12 2006/07/20 20:15:58 mstenner Exp $
+
+import os
+import stat
+import urllib
+import urllib2
+import rfc822
+
+DEBUG = None
+
+try:
+ from cStringIO import StringIO
+except ImportError, msg:
+ from StringIO import StringIO
+
+class RangeError(IOError):
+ """Error raised when an unsatisfiable range is requested."""
+ pass
+
+class HTTPRangeHandler(urllib2.BaseHandler):
+ """Handler that enables HTTP Range headers.
+
+    This was extremely simple. The Range header is an HTTP feature to
+    begin with so all this class does is tell urllib2 that the
+    "206 Partial Content" response from the HTTP server is what we
+    expected.
+
+ Example:
+ import urllib2
+ import byterange
+
+        range_handler = byterange.HTTPRangeHandler()
+ opener = urllib2.build_opener(range_handler)
+
+ # install it
+ urllib2.install_opener(opener)
+
+ # create Request and set Range header
+ req = urllib2.Request('http://www.python.org/')
+        req.add_header('Range', 'bytes=30-50')
+ f = urllib2.urlopen(req)
+ """
+
+ def http_error_206(self, req, fp, code, msg, hdrs):
+ # 206 Partial Content Response
+ r = urllib.addinfourl(fp, hdrs, req.get_full_url())
+ r.code = code
+ r.msg = msg
+ return r
+
+ def http_error_416(self, req, fp, code, msg, hdrs):
+ # HTTP's Range Not Satisfiable error
+ raise RangeError('Requested Range Not Satisfiable')
+
+class HTTPSRangeHandler(HTTPRangeHandler):
+ """ Range Header support for HTTPS. """
+
+ def https_error_206(self, req, fp, code, msg, hdrs):
+ return self.http_error_206(req, fp, code, msg, hdrs)
+
+ def https_error_416(self, req, fp, code, msg, hdrs):
+        return self.http_error_416(req, fp, code, msg, hdrs)
+
+class RangeableFileObject:
+ """File object wrapper to enable raw range handling.
+    This was implemented primarily for handling range
+ specifications for file:// urls. This object effectively makes
+ a file object look like it consists only of a range of bytes in
+ the stream.
+
+ Examples:
+ # expose 10 bytes, starting at byte position 20, from
+ # /etc/aliases.
+ >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
+ # seek seeks within the range (to position 23 in this case)
+ >>> fo.seek(3)
+    # tell tells where you are _within the range_ (position 3 in
+ # this case)
+ >>> fo.tell()
+ # read EOFs if an attempt is made to read past the last
+ # byte in the range. the following will return only 7 bytes.
+ >>> fo.read(30)
+ """
+
+ def __init__(self, fo, rangetup):
+ """Create a RangeableFileObject.
+ fo -- a file like object. only the read() method need be
+ supported but supporting an optimized seek() is
+ preferable.
+ rangetup -- a (firstbyte,lastbyte) tuple specifying the range
+ to work over.
+ The file object provided is assumed to be at byte offset 0.
+ """
+ self.fo = fo
+ (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
+ self.realpos = 0
+ self._do_seek(self.firstbyte)
+
+ def __getattr__(self, name):
+ """This effectively allows us to wrap at the instance level.
+ Any attribute not found in _this_ object will be searched for
+ in self.fo. This includes methods."""
+ if hasattr(self.fo, name):
+ return getattr(self.fo, name)
+ raise AttributeError, name
+
+ def tell(self):
+ """Return the position within the range.
+ This is different from fo.seek in that position 0 is the
+ first byte position of the range tuple. For example, if
+ this object was created with a range tuple of (500,899),
+ tell() will return 0 when at byte position 500 of the file.
+ """
+ return (self.realpos - self.firstbyte)
+
+ def seek(self,offset,whence=0):
+ """Seek within the byte range.
+ Positioning is identical to that described under tell().
+ """
+ assert whence in (0, 1, 2)
+ if whence == 0: # absolute seek
+ realoffset = self.firstbyte + offset
+ elif whence == 1: # relative seek
+ realoffset = self.realpos + offset
+ elif whence == 2: # absolute from end of file
+ # XXX: are we raising the right Error here?
+ raise IOError('seek from end of file not supported.')
+
+ # do not allow seek past lastbyte in range
+ if self.lastbyte and (realoffset >= self.lastbyte):
+ realoffset = self.lastbyte
+
+ self._do_seek(realoffset - self.realpos)
+
+ def read(self, size=-1):
+ """Read within the range.
+ This method will limit the size read based on the range.
+ """
+ size = self._calc_read_size(size)
+ rslt = self.fo.read(size)
+ self.realpos += len(rslt)
+ return rslt
+
+ def readline(self, size=-1):
+ """Read lines within the range.
+ This method will limit the size read based on the range.
+ """
+ size = self._calc_read_size(size)
+ rslt = self.fo.readline(size)
+ self.realpos += len(rslt)
+ return rslt
+
+ def _calc_read_size(self, size):
+ """Handles calculating the amount of data to read based on
+ the range.
+ """
+ if self.lastbyte:
+ if size > -1:
+ if ((self.realpos + size) >= self.lastbyte):
+ size = (self.lastbyte - self.realpos)
+ else:
+ size = (self.lastbyte - self.realpos)
+ return size
+
+ def _do_seek(self,offset):
+ """Seek based on whether wrapped object supports seek().
+ offset is relative to the current position (self.realpos).
+ """
+ assert offset >= 0
+ if not hasattr(self.fo, 'seek'):
+ self._poor_mans_seek(offset)
+ else:
+ self.fo.seek(self.realpos + offset)
+ self.realpos+= offset
+
+ def _poor_mans_seek(self,offset):
+ """Seek by calling the wrapped file objects read() method.
+ This is used for file like objects that do not have native
+ seek support. The wrapped objects read() method is called
+ to manually seek to the desired position.
+ offset -- read this number of bytes from the wrapped
+ file object.
+ raise RangeError if we encounter EOF before reaching the
+ specified offset.
+ """
+ pos = 0
+ bufsize = 1024
+ while pos < offset:
+ if (pos + bufsize) > offset:
+ bufsize = offset - pos
+ buf = self.fo.read(bufsize)
+ if len(buf) != bufsize:
+ raise RangeError('Requested Range Not Satisfiable')
+ pos+= bufsize
+
+class FileRangeHandler(urllib2.FileHandler):
+ """FileHandler subclass that adds Range support.
+ This class handles Range headers exactly like an HTTP
+ server would.
+ """
+ def open_local_file(self, req):
+ import mimetypes
+ import mimetools
+ host = req.get_host()
+ file = req.get_selector()
+ localfile = urllib.url2pathname(file)
+ stats = os.stat(localfile)
+ size = stats[stat.ST_SIZE]
+ modified = rfc822.formatdate(stats[stat.ST_MTIME])
+ mtype = mimetypes.guess_type(file)[0]
+ if host:
+ host, port = urllib.splitport(host)
+ if port or socket.gethostbyname(host) not in self.get_names():
+ raise urllib2.URLError('file not on local host')
+ fo = open(localfile,'rb')
+ brange = req.headers.get('Range',None)
+ brange = range_header_to_tuple(brange)
+ assert brange != ()
+ if brange:
+ (fb,lb) = brange
+ if lb == '': lb = size
+ if fb < 0 or fb > size or lb > size:
+ raise RangeError('Requested Range Not Satisfiable')
+ size = (lb - fb)
+ fo = RangeableFileObject(fo, (fb,lb))
+ headers = mimetools.Message(StringIO(
+ 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
+ (mtype or 'text/plain', size, modified)))
+ return urllib.addinfourl(fo, headers, 'file:'+file)
+
+
+# FTP Range Support
+# Unfortunately, a large amount of base FTP code had to be copied
+# from urllib and urllib2 in order to insert the FTP REST command.
+# Code modifications for range support have been commented as
+# follows:
+# -- range support modifications start/end here
+
+from urllib import splitport, splituser, splitpasswd, splitattr, \
+ unquote, addclosehook, addinfourl
+import ftplib
+import socket
+import sys
+import ftplib
+import mimetypes
+import mimetools
+
+class FTPRangeHandler(urllib2.FTPHandler):
+ def ftp_open(self, req):
+ host = req.get_host()
+ if not host:
+ raise IOError, ('ftp error', 'no host given')
+ host, port = splitport(host)
+ if port is None:
+ port = ftplib.FTP_PORT
+ else:
+ port = int(port)
+
+ # username/password handling
+ user, host = splituser(host)
+ if user:
+ user, passwd = splitpasswd(user)
+ else:
+ passwd = None
+ host = unquote(host)
+ user = unquote(user or '')
+ passwd = unquote(passwd or '')
+
+ try:
+ host = socket.gethostbyname(host)
+ except socket.error, msg:
+ raise urllib2.URLError(msg)
+ path, attrs = splitattr(req.get_selector())
+ dirs = path.split('/')
+ dirs = map(unquote, dirs)
+ dirs, file = dirs[:-1], dirs[-1]
+ if dirs and not dirs[0]:
+ dirs = dirs[1:]
+ try:
+ fw = self.connect_ftp(user, passwd, host, port, dirs)
+ type = file and 'I' or 'D'
+ for attr in attrs:
+ attr, value = splitattr(attr)
+ if attr.lower() == 'type' and \
+ value in ('a', 'A', 'i', 'I', 'd', 'D'):
+ type = value.upper()
+
+ # -- range support modifications start here
+ rest = None
+ range_tup = range_header_to_tuple(req.headers.get('Range',None))
+ assert range_tup != ()
+ if range_tup:
+ (fb,lb) = range_tup
+ if fb > 0: rest = fb
+ # -- range support modifications end here
+
+ fp, retrlen = fw.retrfile(file, type, rest)
+
+ # -- range support modifications start here
+ if range_tup:
+ (fb,lb) = range_tup
+ if lb == '':
+ if retrlen is None or retrlen == 0:
+ raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+ lb = retrlen
+ retrlen = lb - fb
+ if retrlen < 0:
+ # beginning of range is larger than file
+ raise RangeError('Requested Range Not Satisfiable')
+ else:
+ retrlen = lb - fb
+ fp = RangeableFileObject(fp, (0,retrlen))
+ # -- range support modifications end here
+
+ headers = ""
+ mtype = mimetypes.guess_type(req.get_full_url())[0]
+ if mtype:
+ headers += "Content-Type: %s\n" % mtype
+ if retrlen is not None and retrlen >= 0:
+ headers += "Content-Length: %d\n" % retrlen
+ sf = StringIO(headers)
+ headers = mimetools.Message(sf)
+ return addinfourl(fp, headers, req.get_full_url())
+ except ftplib.all_errors, msg:
+ raise IOError, ('ftp error', msg), sys.exc_info()[2]
+
+ def connect_ftp(self, user, passwd, host, port, dirs):
+ fw = ftpwrapper(user, passwd, host, port, dirs)
+ return fw
+
+class ftpwrapper(urllib.ftpwrapper):
+ # range support note:
+ # this ftpwrapper code is copied directly from
+ # urllib. The only enhancement is to add the rest
+ # argument and pass it on to ftp.ntransfercmd
+ def retrfile(self, file, type, rest=None):
+ self.endtransfer()
+ if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
+ else: cmd = 'TYPE ' + type; isdir = 0
+ try:
+ self.ftp.voidcmd(cmd)
+ except ftplib.all_errors:
+ self.init()
+ self.ftp.voidcmd(cmd)
+ conn = None
+ if file and not isdir:
+ # Use nlst to see if the file exists at all
+ try:
+ self.ftp.nlst(file)
+ except ftplib.error_perm, reason:
+ raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ # Restore the transfer mode!
+ self.ftp.voidcmd(cmd)
+ # Try to retrieve as a file
+ try:
+ cmd = 'RETR ' + file
+ conn = self.ftp.ntransfercmd(cmd, rest)
+ except ftplib.error_perm, reason:
+ if str(reason)[:3] == '501':
+ # workaround for REST not supported error
+ fp, retrlen = self.retrfile(file, type)
+ fp = RangeableFileObject(fp, (rest,''))
+ return (fp, retrlen)
+ elif str(reason)[:3] != '550':
+ raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ if not conn:
+ # Set transfer mode to ASCII!
+ self.ftp.voidcmd('TYPE A')
+ # Try a directory listing
+ if file: cmd = 'LIST ' + file
+ else: cmd = 'LIST'
+ conn = self.ftp.ntransfercmd(cmd)
+ self.busy = 1
+ # Pass back both a suitably decorated object and a retrieval length
+ return (addclosehook(conn[0].makefile('rb'),
+ self.endtransfer), conn[1])
+
+
+####################################################################
+# Range Tuple Functions
+# XXX: These range tuple functions might go better in a class.
+
+_rangere = None
+def range_header_to_tuple(range_header):
+ """Get a (firstbyte,lastbyte) tuple from a Range header value.
+
+ Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
+ function pulls the firstbyte and lastbyte values and returns
+ a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
+ the header value, it is returned as an empty string in the
+ tuple.
+
+ Return None if range_header is None
+ Return () if range_header does not conform to the range spec
+ pattern.
+
+ """
+ global _rangere
+ if range_header is None: return None
+ if _rangere is None:
+ import re
+ _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
+ match = _rangere.match(range_header)
+ if match:
+ tup = range_tuple_normalize(match.group(1,2))
+ if tup and tup[1]:
+ tup = (tup[0],tup[1]+1)
+ return tup
+ return ()
+
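+# Illustrative behaviour of range_header_to_tuple (derived from the code above;
+# note the returned lastbyte is made exclusive):
+#   "bytes=500-899" -> (500, 900)
+#   "bytes=500-"    -> (500, '')
+#   "not-a-range"   -> ()
+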
+def range_tuple_to_header(range_tup):
+ """Convert a range tuple to a Range header value.
+ Return a string of the form "bytes=<firstbyte>-<lastbyte>" or None
+ if no range is needed.
+ """
+ if range_tup is None: return None
+ range_tup = range_tuple_normalize(range_tup)
+ if range_tup:
+ if range_tup[1]:
+ range_tup = (range_tup[0],range_tup[1] - 1)
+ return 'bytes=%s-%s' % range_tup
+
+def range_tuple_normalize(range_tup):
+ """Normalize a (first_byte,last_byte) range tuple.
+ Return a tuple whose first element is guaranteed to be an int
+ and whose second element will be '' (meaning: the last byte) or
+ an int. Finally, return None if the normalized tuple == (0,'')
+    as that is equivalent to retrieving the entire file.
+ """
+ if range_tup is None: return None
+ # handle first byte
+ fb = range_tup[0]
+ if fb in (None,''): fb = 0
+ else: fb = int(fb)
+ # handle last byte
+ try: lb = range_tup[1]
+ except IndexError: lb = ''
+ else:
+ if lb is None: lb = ''
+ elif lb != '': lb = int(lb)
+ # check if range is over the entire file
+ if (fb,lb) == (0,''): return None
+ # check that the range is valid
+ if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+ return (fb,lb)
+
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
+
+"""A high-level cross-protocol url-grabber.
+
+GENERAL ARGUMENTS (kwargs)
+
+ Where possible, the module-level default is indicated, and legal
+ values are provided.
+
+ copy_local = 0 [0|1]
+
+ ignored except for file:// urls, in which case it specifies
+ whether urlgrab should still make a copy of the file, or simply
+ point to the existing copy. The module level default for this
+ option is 0.
+
+ close_connection = 0 [0|1]
+
+ tells URLGrabber to close the connection after a file has been
+    transferred. This is ignored unless the download happens with the
+ http keepalive handler (keepalive=1). Otherwise, the connection
+ is left open for further use. The module level default for this
+ option is 0 (keepalive connections will not be closed).
+
+ keepalive = 1 [0|1]
+
+ specifies whether keepalive should be used for HTTP/1.1 servers
+ that support it. The module level default for this option is 1
+ (keepalive is enabled).
+
+ progress_obj = None
+
+ a class instance that supports the following methods:
+ po.start(filename, url, basename, length, text)
+ # length will be None if unknown
+ po.update(read) # read == bytes read so far
+ po.end()
+
+ text = None
+
+ specifies alternative text to be passed to the progress meter
+ object. If not given, the default progress meter will use the
+ basename of the file.
+
+ throttle = 1.0
+
+ a number - if it's an int, it's the bytes/second throttle limit.
+ If it's a float, it is first multiplied by bandwidth. If throttle
+ == 0, throttling is disabled. If None, the module-level default
+ (which can be set on default_grabber.throttle) is used. See
+ BANDWIDTH THROTTLING for more information.
+
+ timeout = None
+
+ a positive float expressing the number of seconds to wait for socket
+ operations. If the value is None or 0.0, socket operations will block
+ forever. Setting this option causes urlgrabber to call the settimeout
+ method on the Socket object used for the request. See the Python
+ documentation on settimeout for more information.
+ http://www.python.org/doc/current/lib/socket-objects.html
+
+ bandwidth = 0
+
+ the nominal max bandwidth in bytes/second. If throttle is a float
+ and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set on
+ default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
+ more information.
+
+ range = None
+
+ a tuple of the form (first_byte, last_byte) describing a byte
+ range to retrieve. Either or both of the values may set to
+ None. If first_byte is None, byte offset 0 is assumed. If
+ last_byte is None, the last byte available is assumed. Note that
+    the range specification is python-like in that (0,10) will yield
+ the first 10 bytes of the file.
+
+ If set to None, no range will be used.
+
+ reget = None [None|'simple'|'check_timestamp']
+
+ whether to attempt to reget a partially-downloaded file. Reget
+ only applies to .urlgrab and (obviously) only if there is a
+ partially downloaded file. Reget has two modes:
+
+ 'simple' -- the local file will always be trusted. If there
+ are 100 bytes in the local file, then the download will always
+ begin 100 bytes into the requested file.
+
+ 'check_timestamp' -- the timestamp of the server file will be
+ compared to the timestamp of the local file. ONLY if the
+ local file is newer than or the same age as the server file
+ will reget be used. If the server file is newer, or the
+ timestamp is not returned, the entire file will be fetched.
+
+ NOTE: urlgrabber can do very little to verify that the partial
+ file on disk is identical to the beginning of the remote file.
+ You may want to either employ a custom "checkfunc" or simply avoid
+ using reget in situations where corruption is a concern.
+
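+    For example (illustrative only):
+
+      g = URLGrabber(reget='simple')
+      g.urlgrab('http://foo.com/big.iso', 'big.iso') # resumes a partial file
+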
+ user_agent = 'urlgrabber/VERSION'
+
+ a string, usually of the form 'AGENT/VERSION' that is provided to
+ HTTP servers in the User-agent header. The module level default
+ for this option is "urlgrabber/VERSION".
+
+ http_headers = None
+
+ a tuple of 2-tuples, each containing a header and value. These
+ will be used for http and https requests only. For example, you
+ can do
+ http_headers = (('Pragma', 'no-cache'),)
+
+ ftp_headers = None
+
+ this is just like http_headers, but will be used for ftp requests.
+
+ proxies = None
+
+ a dictionary that maps protocol schemes to proxy hosts. For
+ example, to use a proxy server on host "foo" port 3128 for http
+ and https URLs:
+ proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
+ note that proxy authentication information may be provided using
+ normal URL constructs:
+ proxies={ 'http' : 'http://user:host@foo:3128' }
+ Lastly, if proxies is None, the default environment settings will
+ be used.
+
+ prefix = None
+
+ a url prefix that will be prepended to all requested urls. For
+ example:
+ g = URLGrabber(prefix='http://foo.com/mirror/')
+ g.urlgrab('some/file.txt')
+ ## this will fetch 'http://foo.com/mirror/some/file.txt'
+ This option exists primarily to allow identical behavior to
+ MirrorGroup (and derived) instances. Note: a '/' will be inserted
+ if necessary, so you cannot specify a prefix that ends with a
+ partial file or directory name.
+
+ opener = None
+
+ Overrides the default urllib2.OpenerDirector provided to urllib2
+ when making requests. This option exists so that the urllib2
+ handler chain may be customized. Note that the range, reget,
+ proxy, and keepalive features require that custom handlers be
+ provided to urllib2 in order to function properly. If an opener
+ option is provided, no attempt is made by urlgrabber to ensure
+ chain integrity. You are responsible for ensuring that any
+ extension handlers are present if said features are required.
+
+ cache_openers = True
+
+ controls whether urllib2 openers should be cached and reused, or
+ whether they should be created each time. There's a modest
+ overhead in recreating them, but it's slightly safer to do so if
+ you're modifying the handlers between calls.
+
+ data = None
+
+ Only relevant for the HTTP family (and ignored for other
+ protocols), this allows HTTP POSTs. When the data kwarg is
+ present (and not None), an HTTP request will automatically become
+ a POST rather than GET. This is done by direct passthrough to
+ urllib2. If you use this, you may also want to set the
+ 'Content-length' and 'Content-type' headers with the http_headers
+ option. Note that python 2.2 handles the case of these
+    option. Note that python 2.2 handles the case (capitalization) of these
+    headers badly, and if you do not use the proper case (shown here), your
+
+ urlparser = URLParser()
+
+ The URLParser class handles pre-processing of URLs, including
+ auth-handling for user/pass encoded in http urls, file handing
+ (that is, filenames not sent as a URL), and URL quoting. If you
+ want to override any of this behavior, you can pass in a
+ replacement instance. See also the 'quote' option.
+
+ quote = None
+
+ Whether or not to quote the path portion of a url.
+ quote = 1 -> quote the URLs (they're not quoted yet)
+ quote = 0 -> do not quote them (they're already quoted)
+ quote = None -> guess what to do
+
+ This option only affects proper urls like 'file:///etc/passwd'; it
+ does not affect 'raw' filenames like '/etc/passwd'. The latter
+ will always be quoted as they are converted to URLs. Also, only
+ the path part of a url is quoted. If you need more fine-grained
+ control, you should probably subclass URLParser and pass it in via
+ the 'urlparser' option.
+
+ ssl_ca_cert = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, it will be used to create an SSL
+ context. If both ssl_ca_cert and ssl_context are provided, then
+ ssl_context will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+ ssl_context = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, this SSL context will be used.
+ If both ssl_ca_cert and ssl_context are provided, then ssl_context
+ will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+
+RETRY RELATED ARGUMENTS
+
+ retry = None
+
+ the number of times to retry the grab before bailing. If this is
+ zero, it will retry forever. This was intentional... really, it
+ was :). If this value is not supplied or is supplied but is None
+ retrying does not occur.
+
+ retrycodes = [-1,2,4,5,6,7]
+
+ a sequence of errorcodes (values of e.errno) for which it should
+ retry. See the doc on URLGrabError for more details on this. You
+ might consider modifying a copy of the default codes rather than
+ building yours from scratch so that if the list is extended in the
+ future (or one code is split into two) you can still enjoy the
+ benefits of the default list. You can do that with something like
+ this:
+
+ retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
+ if 12 not in retrycodes:
+ retrycodes.append(12)
+
+ checkfunc = None
+
+ a function to do additional checks. This defaults to None, which
+ means no additional checking. The function should simply return
+ on a successful check. It should raise URLGrabError on an
+ unsuccessful check. Raising of any other exception will be
+ considered immediate failure and no retries will occur.
+
+ If it raises URLGrabError, the error code will determine the retry
+ behavior. Negative error numbers are reserved for use by these
+ passed in functions, so you can use many negative numbers for
+ different types of failure. By default, -1 results in a retry,
+ but this can be customized with retrycodes.
+
+ If you simply pass in a function, it will be given exactly one
+ argument: a CallbackObject instance with the .url attribute
+ defined and either .filename (for urlgrab) or .data (for urlread).
+ For urlgrab, .filename is the name of the local file. For
+ urlread, .data is the actual string data. If you need other
+ arguments passed to the callback (program state of some sort), you
+ can do so like this:
+
+ checkfunc=(function, ('arg1', 2), {'kwarg': 3})
+
+ if the downloaded file has filename /tmp/stuff, then this will
+ result in this call (for urlgrab):
+
+ function(obj, 'arg1', 2, kwarg=3)
+ # obj.filename = '/tmp/stuff'
+ # obj.url = 'http://foo.com/stuff'
+
+ NOTE: both the "args" tuple and "kwargs" dict must be present if
+ you use this syntax, but either (or both) can be empty.
+
+ failure_callback = None
+
+ The callback that gets called during retries when an attempt to
+ fetch a file fails. The syntax for specifying the callback is
+ identical to checkfunc, except for the attributes defined in the
+ CallbackObject instance. The attributes for failure_callback are:
+
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+
+ The callback is present primarily to inform the calling program of
+ the failure, but if it raises an exception (including the one it's
+ passed) that exception will NOT be caught and will therefore cause
+ future retries to be aborted.
+
+ The callback is called for EVERY failure, including the last one.
+ On the last try, the callback can raise an alternate exception,
+ but it cannot (without severe trickiness) prevent the exception
+ from being raised.
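+
+    For example (illustrative only):
+
+      def failure_cb(obj):
+          # obj.url, obj.exception, obj.tries, obj.retry are described above
+          print 'try %d/%d for %s failed: %s' % (obj.tries, obj.retry,
+                                                 obj.url, obj.exception)
+
+      g = URLGrabber(retry=3, failure_callback=failure_cb)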
+
+ interrupt_callback = None
+
+ This callback is called if KeyboardInterrupt is received at any
+ point in the transfer. Basically, this callback can have three
+ impacts on the fetch process based on the way it exits:
+
+ 1) raise no exception: the current fetch will be aborted, but
+ any further retries will still take place
+
+ 2) raise a URLGrabError: if you're using a MirrorGroup, then
+ this will prompt a failover to the next mirror according to
+ the behavior of the MirrorGroup subclass. It is recommended
+ that you raise URLGrabError with code 15, 'user abort'. If
+ you are NOT using a MirrorGroup subclass, then this is the
+ same as (3).
+
+ 3) raise some other exception (such as KeyboardInterrupt), which
+ will not be caught at either the grabber or mirror levels.
+ That is, it will be raised up all the way to the caller.
+
+ This callback is very similar to failure_callback. They are
+ passed the same arguments, so you could use the same function for
+ both.
+
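+    A minimal sketch (assuming you want the clean 'user abort' failover
+    described in case 2 above):
+
+      def user_abort(obj):
+          raise URLGrabError(15, 'user abort after %s tries' % obj.tries)
+
+      urlgrab('http://example.com/file', interrupt_callback=user_abort)
+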
+BANDWIDTH THROTTLING
+
+ urlgrabber supports throttling via two values: throttle and
+ bandwidth. Between the two, you can either specify an absolute
+ throttle threshold or specify a threshold as a fraction of maximum
+ available bandwidth.
+
+ throttle is a number - if it's an int, it's the bytes/second
+ throttle limit. If it's a float, it is first multiplied by
+ bandwidth. If throttle == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_throttle) is used.
+
+ bandwidth is the nominal max bandwidth in bytes/second. If throttle
+ is a float and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_bandwidth) is used.
+
+ THROTTLING EXAMPLES:
+
+ Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
+ per second, or 12,500,000 Bytes per second. You have a number of
+ throttling options:
+
+ *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
+
+ This will limit urlgrab to use half of your available bandwidth.
+
+ *) set_throttle(6250000) # throttle is an int
+
+ This will also limit urlgrab to use half of your available
+ bandwidth, regardless of what bandwidth is set to.
+
+ *) set_bandwidth(6250000); set_throttle(1.0) # throttle is a float
+
+ Use half your bandwidth
+
+ *) set_bandwidth(6250000); set_throttle(2.0) # throttle is a float
+
+ Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
+
+ *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
+
+ Disable throttling - this is more efficient than a very large
+ throttle setting.
+
+ *) set_bandwidth(0); set_throttle(1.0) # throttle is a float, bandwidth = 0
+
+ Disable throttling - this is the default when the module is loaded.
+
+ SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
+
+ While this is flexible, it's not extremely obvious to the user. I
+ suggest you implement a float throttle as a percent to make the
+ distinction between absolute and relative throttling very explicit.
+
+ Also, you may want to convert the units to something more convenient
+ than bytes/second, such as kbps or kB/s, etc.
+
+"""
+
+# $Id: grabber.py,v 1.52 2006/12/12 19:08:46 mstenner Exp $
+
+import os
+import os.path
+import sys
+import urlparse
+import rfc822
+import time
+import string
+import urllib
+import urllib2
+import mimetools
+import thread
+from stat import * # S_* and ST_*
+import pycurl
+from ftplib import parse150
+from StringIO import StringIO
+from tempfile import mkstemp
+
+########################################################################
+# MODULE INITIALIZATION
+########################################################################
+try:
+ exec('from ' + (__name__.split('.'))[0] + ' import __version__')
+except:
+ __version__ = '???'
+
+import sslfactory
+
+auth_handler = urllib2.HTTPBasicAuthHandler( \
+ urllib2.HTTPPasswordMgrWithDefaultRealm())
+
+try:
+ from i18n import _
+except ImportError, msg:
+ def _(st): return st
+
+try:
+ from httplib import HTTPException
+except ImportError, msg:
+ HTTPException = None
+
+try:
+ # This is a convenient way to make keepalive optional.
+ # Just rename the module so it can't be imported.
+ import keepalive
+ from keepalive import HTTPHandler, HTTPSHandler
+ have_keepalive = True
+ keepalive_http_handler = HTTPHandler()
+except ImportError, msg:
+ have_keepalive = False
+ keepalive_http_handler = None
+
+try:
+ # add in range support conditionally too
+ import byterange
+ from byterange import HTTPRangeHandler, HTTPSRangeHandler, \
+ FileRangeHandler, FTPRangeHandler, range_tuple_normalize, \
+ range_tuple_to_header, RangeError
+except ImportError, msg:
+ range_handlers = ()
+ RangeError = None
+ have_range = 0
+else:
+ range_handlers = (HTTPRangeHandler(), HTTPSRangeHandler(),
+ FileRangeHandler(), FTPRangeHandler())
+ have_range = 1
+
+
+# check whether socket timeout support is available (Python >= 2.3)
+import socket
+try:
+ TimeoutError = socket.timeout
+ have_socket_timeout = True
+except AttributeError:
+ TimeoutError = None
+ have_socket_timeout = False
+
+########################################################################
+# functions for debugging output. These functions are here because they
+# are also part of the module initialization.
+DEBUG = None
+def set_logger(DBOBJ):
+ """Set the DEBUG object. This is called by _init_default_logger when
+ the environment variable URLGRABBER_DEBUG is set, but can also be
+ called by a calling program. Basically, if the calling program uses
+ the logging module and would like to incorporate urlgrabber logging,
+ then it can do so this way. It's probably not necessary as most
+ internal logging is only for debugging purposes.
+
+ The passed-in object should be a logging.Logger instance. It will
+ be pushed into the keepalive and byterange modules if they're
+ being used. The mirror module pulls this object in on import, so
+ you will need to manually push into it. In fact, you may find it
+ tidier to simply push your logging object (or objects) into each
+ of these modules independently.
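+
+    A minimal sketch of hooking urlgrabber into an existing logging
+    setup (the logger name and level below are just examples):
+
+      import logging
+      log = logging.getLogger('myapp.urlgrabber')
+      log.setLevel(logging.DEBUG)
+      set_logger(log)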
+ """
+
+ global DEBUG
+ DEBUG = DBOBJ
+ if have_keepalive and keepalive.DEBUG is None:
+ keepalive.DEBUG = DBOBJ
+ if have_range and byterange.DEBUG is None:
+ byterange.DEBUG = DBOBJ
+ if sslfactory.DEBUG is None:
+ sslfactory.DEBUG = DBOBJ
+
+def _init_default_logger(logspec=None):
+ '''Examines the environment variable URLGRABBER_DEBUG and creates
+ a logging object (logging.logger) based on the contents. It takes
+ the form
+
+ URLGRABBER_DEBUG=level,filename
+
+ where "level" can be either an integer or a log level from the
+ logging module (DEBUG, INFO, etc). If the integer is zero or
+ less, logging will be disabled. Filename is the filename where
+ logs will be sent. If it is "-", then stdout will be used. If
+ the filename is empty or missing, stderr will be used. If the
+ variable cannot be processed or the logging module cannot be
+ imported (python < 2.3) then logging will be disabled. Here are
+ some examples:
+
+ URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
+ URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
+ URLGRABBER_DEBUG=INFO # log info and higher to stderr
+
+ This function is called during module initialization. It is not
+ intended to be called from outside. The only reason it is a
+ function at all is to keep the module-level namespace tidy and to
+ collect the code into a nice block.'''
+
+ try:
+ if logspec is None:
+ logspec = os.environ['URLGRABBER_DEBUG']
+ dbinfo = logspec.split(',')
+ import logging
+ level = logging._levelNames.get(dbinfo[0], None)
+ if level is None: level = int(dbinfo[0])
+ if level < 1: raise ValueError()
+
+ formatter = logging.Formatter('%(asctime)s %(message)s')
+ if len(dbinfo) > 1: filename = dbinfo[1]
+ else: filename = ''
+ if filename == '': handler = logging.StreamHandler(sys.stderr)
+ elif filename == '-': handler = logging.StreamHandler(sys.stdout)
+ else: handler = logging.FileHandler(filename)
+ handler.setFormatter(formatter)
+ DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.addHandler(handler)
+ DBOBJ.setLevel(level)
+ except (KeyError, ImportError, ValueError):
+ DBOBJ = None
+ set_logger(DBOBJ)
+
+def _log_package_state():
+ if not DEBUG: return
+ DEBUG.info('urlgrabber version = %s' % __version__)
+ DEBUG.info('have_m2crypto = %s' % sslfactory.have_m2crypto)
+ DEBUG.info('trans function "_" = %s' % _)
+ DEBUG.info('have_keepalive = %s' % have_keepalive)
+ DEBUG.info('have_range = %s' % have_range)
+ DEBUG.info('have_socket_timeout = %s' % have_socket_timeout)
+
+_init_default_logger()
+_log_package_state()
+########################################################################
+# END MODULE INITIALIZATION
+########################################################################
+
+
+
+class URLGrabError(IOError):
+ """
+ URLGrabError error codes:
+
+ URLGrabber error codes (0 -- 255)
+ 0 - everything looks good (you should never see this)
+ 1 - malformed url
+ 2 - local file doesn't exist
+ 3 - request for non-file local file (dir, etc)
+ 4 - IOError on fetch
+ 5 - OSError on fetch
+ 6 - no content length header when we expected one
+ 7 - HTTPException
+ 8 - Exceeded read limit (for urlread)
+ 9 - Requested byte range not satisfiable.
+ 10 - Byte range requested, but range support unavailable
+ 11 - Illegal reget mode
+ 12 - Socket timeout
+ 13 - malformed proxy url
+ 14 - HTTPError (includes .code and .exception attributes)
+ 15 - user abort
+ 16 - error writing to local file
+
+ MirrorGroup error codes (256 -- 511)
+ 256 - No more mirrors left to try
+
+ Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
+ [ this range reserved for application-specific error codes ]
+
+ Retry codes (< 0)
+ -1 - retry the download, unknown reason
+
+ Note: to test which group a code is in, you can simply do integer
+ division by 256: e.errno / 256
+
+ Negative codes are reserved for use by functions passed in to
+ retrygrab with checkfunc. The value -1 is built in as a generic
+ retry code and is already included in the retrycodes list.
+ Therefore, you can create a custom check function that simply
+ returns -1 and the fetch will be re-tried. For more customized
+ retries, you can use other negative numbers and include them in
+ retrycodes. This is nice for outputting useful messages about
+ what failed.
+
+ You can use these error codes like so:
+ try: urlgrab(url)
+ except URLGrabError, e:
+ if e.errno == 3: ...
+ # or
+ print e.strerror
+ # or simply
+ print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
+ """
+ def __init__(self, *args):
+ IOError.__init__(self, *args)
+ self.url = "No url specified"
+
+class CallbackObject:
+ """Container for returned callback data.
+
+ This is currently a dummy class into which urlgrabber can stuff
+ information for passing to callbacks. This way, the prototype for
+ all callbacks is the same, regardless of the data that will be
+ passed back. Any function that accepts a callback function as an
+ argument SHOULD document what it will define in this object.
+
+ It is possible that this class will have some greater
+ functionality in the future.
+ """
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+def urlgrab(url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+ If filename is none, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be different
+ from the passed-in filename if the copy_local kwarg == 0.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlgrab(url, filename, **kwargs)
+
+def urlopen(url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle specifications exist, then
+ a special file object will be returned that supports them.
+ The file object can be treated like any other file object.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlopen(url, **kwargs)
+
+def urlread(url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note that urlread
+ is NOT intended to be used as a way of saying "I want the first N bytes"
+ but rather 'read the whole file into memory, but don't use too much'
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlread(url, limit, **kwargs)
+
+
+class URLParser:
+ """Process the URLs before passing them to urllib2.
+
+ This class does several things:
+
+ * add any prefix
+ * translate a "raw" file to a proper file: url
+ * handle any http or https auth that's encoded within the url
+ * quote the url
+
+ Only the "parse" method is called directly, and it calls sub-methods.
+
+ An instance of this class is held in the options object, which
+ means that it's easy to change the behavior by sub-classing and
+ passing the replacement in. It need only have a method like:
+
+ url, parts = urlparser.parse(url, opts)
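+
+    As an illustrative sketch (the class name below is just an
+    example), a replacement parser that never quotes can be passed in
+    via the urlparser option:
+
+      class NoQuoteParser(URLParser):
+          def guess_should_quote(self, parts):
+              return 0
+
+      urlgrab('http://example.com/file', urlparser=NoQuoteParser())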
+ """
+
+ def parse(self, url, opts):
+ """parse the url and return the (modified) url and its parts
+
+ Note: a raw file WILL be quoted when it's converted to a URL.
+ However, other urls (ones which come with a proper scheme) may
+ or may not be quoted according to opts.quote
+
+ opts.quote = 1 --> quote it
+ opts.quote = 0 --> do not quote it
+ opts.quote = None --> guess
+ """
+ quote = opts.quote
+
+ if opts.prefix:
+ url = self.add_prefix(url, opts.prefix)
+
+ parts = urlparse.urlparse(url)
+ (scheme, host, path, parm, query, frag) = parts
+
+ if not scheme or (len(scheme) == 1 and scheme in string.letters):
+ # if a scheme isn't specified, we guess that it's "file:"
+ if url[0] not in '/\\': url = os.path.abspath(url)
+ url = 'file:' + urllib.pathname2url(url)
+ parts = urlparse.urlparse(url)
+ quote = 0 # pathname2url quotes, so we won't do it again
+
+ if scheme in ['http', 'https']:
+ parts = self.process_http(parts)
+
+ if quote is None:
+ quote = self.guess_should_quote(parts)
+ if quote:
+ parts = self.quote(parts)
+
+ url = urlparse.urlunparse(parts)
+ return url, parts
+
+ def add_prefix(self, url, prefix):
+ if prefix[-1] == '/' or url[0] == '/':
+ url = prefix + url
+ else:
+ url = prefix + '/' + url
+ return url
+
+ def process_http(self, parts):
+ (scheme, host, path, parm, query, frag) = parts
+
+ if '@' in host and auth_handler:
+ try:
+ user_pass, host = host.split('@', 1)
+ if ':' in user_pass:
+ user, password = user_pass.split(':', 1)
+ else:
+ user, password = user_pass, ''
+ except ValueError, e:
+ # 'url' is not defined in this method; rebuild it from the parts
+ bad_url = urlparse.urlunparse(parts)
+ err = URLGrabError(1, _('Bad URL: %s') % bad_url)
+ err.url = bad_url
+ raise err
+ if DEBUG: DEBUG.info('adding HTTP auth: %s, %s', user, password)
+ auth_handler.add_password(None, host, user, password)
+
+ return (scheme, host, path, parm, query, frag)
+
+ def quote(self, parts):
+ """quote the URL
+
+ This method quotes ONLY the path part. If you need to quote
+ other parts, you should override this and pass in your derived
+ class. The other alternative is to quote other parts before
+ passing into urlgrabber.
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ path = urllib.quote(path)
+ return (scheme, host, path, parm, query, frag)
+
+ hexvals = '0123456789ABCDEF'
+ def guess_should_quote(self, parts):
+ """
+ Guess whether we should quote a path. This amounts to
+ guessing whether it's already quoted.
+
+ find ' ' -> 1
+ find '%' -> 1
+ find '%XX' -> 0
+ else -> 1
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ if ' ' in path:
+ return 1
+ ind = string.find(path, '%')
+ if ind > -1:
+ while ind > -1:
+ if len(path) < ind+3:
+ return 1
+ code = path[ind+1:ind+3].upper()
+ if code[0] not in self.hexvals or \
+ code[1] not in self.hexvals:
+ return 1
+ ind = string.find(path, '%', ind+1)
+ return 0
+ return 1
+
+class URLGrabberOptions:
+ """Class to ease kwargs handling."""
+
+ def __init__(self, delegate=None, **kwargs):
+ """Initialize URLGrabberOptions object.
+ Set default values for all options and then update options specified
+ in kwargs.
+ """
+ self.delegate = delegate
+ if delegate is None:
+ self._set_defaults()
+ self._set_attributes(**kwargs)
+
+ def __getattr__(self, name):
+ if self.delegate and hasattr(self.delegate, name):
+ return getattr(self.delegate, name)
+ raise AttributeError, name
+
+ def raw_throttle(self):
+ """Calculate raw throttle value from throttle and bandwidth
+ values.
+ """
+ if self.throttle <= 0:
+ return 0
+ elif type(self.throttle) == type(0):
+ return float(self.throttle)
+ else: # throttle is a float
+ return self.bandwidth * self.throttle
+
+ def derive(self, **kwargs):
+ """Create a derived URLGrabberOptions instance.
+ This method creates a new instance and overrides the
+ options specified in kwargs.
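+
+        A sketch of typical use (the option values are placeholders):
+        options not overridden delegate back to the base instance.
+
+          base = URLGrabberOptions(retry=3)
+          per_request = base.derive(timeout=10)
+          # per_request.timeout == 10; per_request.retry == 3 (via delegate)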
+ """
+ return URLGrabberOptions(delegate=self, **kwargs)
+
+ def _set_attributes(self, **kwargs):
+ """Update object attributes with those provided in kwargs."""
+ self.__dict__.update(kwargs)
+ if have_range and kwargs.has_key('range'):
+ # normalize the supplied range value
+ self.range = range_tuple_normalize(self.range)
+ if not self.reget in [None, 'simple', 'check_timestamp']:
+ raise URLGrabError(11, _('Illegal reget mode: %s') \
+ % (self.reget, ))
+
+ def _set_defaults(self):
+ """Set all options to their default values.
+ When adding new options, make sure a default is
+ provided here.
+ """
+ self.progress_obj = None
+ self.throttle = 1.0
+ self.bandwidth = 0
+ self.retry = None
+ self.retrycodes = [-1,2,4,5,6,7]
+ self.checkfunc = None
+ self.copy_local = 0
+ self.close_connection = 0
+ self.range = None
+ self.user_agent = 'urlgrabber/%s' % __version__
+ self.keepalive = 1
+ self.proxies = None
+ self.reget = None
+ self.failure_callback = None
+ self.interrupt_callback = None
+ self.prefix = None
+ self.opener = None
+ self.cache_openers = True
+ self.timeout = None
+ self.text = None
+ self.http_headers = None
+ self.ftp_headers = None
+ self.data = None
+ self.urlparser = URLParser()
+ self.quote = None
+ self.ssl_ca_cert = None
+ self.ssl_context = None
+
+ def __repr__(self):
+ return self.format()
+
+ def format(self, indent=' '):
+ keys = self.__dict__.keys()
+ if self.delegate is not None:
+ keys.remove('delegate')
+ keys.sort()
+ s = '{\n'
+ for k in keys:
+ s = s + indent + '%-15s: %s,\n' % \
+ (repr(k), repr(self.__dict__[k]))
+ if self.delegate:
+ df = self.delegate.format(indent + ' ')
+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
+ s = s + indent + '}'
+ return s
+
+class URLGrabber:
+ """Provides easy opening of URLs with a variety of options.
+
+ All options are specified as kwargs. Options may be specified when
+ the class is created and may be overridden on a per request basis.
+
+ New objects inherit default values from default_grabber.
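+
+    A brief usage sketch (URLs and option values are placeholders):
+
+      g = URLGrabber(retry=3, timeout=30)
+      g.urlgrab('http://example.com/file', filename='/tmp/file')
+      text = g.urlread('http://example.com/status', timeout=5)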
+ """
+
+ def __init__(self, **kwargs):
+ self.opts = URLGrabberOptions(**kwargs)
+
+ def _retry(self, opts, func, *args):
+ tries = 0
+ while 1:
+ # there are only two ways out of this loop. The second has
+ # several "sub-ways"
+ # 1) via the return in the "try" block
+ # 2) by some exception being raised
+ # a) an exception is raised that we don't "except"
+ # b) a callback raises ANY exception
+ # c) we're not retry-ing or have run out of retries
+ # d) the URLGrabError code is not in retrycodes
+ # beware of infinite loops :)
+ tries = tries + 1
+ exception = None
+ retrycode = None
+ callback = None
+ if DEBUG: DEBUG.info('attempt %i/%s: %s',
+ tries, opts.retry, args[0])
+ try:
+ r = apply(func, (opts,) + args, {})
+ if DEBUG: DEBUG.info('success')
+ return r
+ except URLGrabError, e:
+ exception = e
+ callback = opts.failure_callback
+ retrycode = e.errno
+ except KeyboardInterrupt, e:
+ exception = e
+ callback = opts.interrupt_callback
+
+ if DEBUG: DEBUG.info('exception: %s', exception)
+ if callback:
+ if DEBUG: DEBUG.info('calling callback: %s', callback)
+ cb_func, cb_args, cb_kwargs = self._make_callback(callback)
+ obj = CallbackObject(exception=exception, url=args[0],
+ tries=tries, retry=opts.retry)
+ cb_func(obj, *cb_args, **cb_kwargs)
+
+ if (opts.retry is None) or (tries == opts.retry):
+ if DEBUG: DEBUG.info('retries exceeded, re-raising')
+ raise
+
+ if (retrycode is not None) and (retrycode not in opts.retrycodes):
+ if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+ retrycode, opts.retrycodes)
+ raise
+
+ def urlopen(self, url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
+ like any other file object.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ def retryfunc(opts, url):
+ return PyCurlFileObject(url, filename=None, opts=opts)
+ return self._retry(opts, retryfunc, url)
+
+ def urlgrab(self, url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+ If filename is none, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be
+ different from the passed-in filename if copy_local == 0.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ (scheme, host, path, parm, query, frag) = parts
+ if filename is None:
+ filename = os.path.basename( urllib.unquote(path) )
+ if scheme == 'file' and not opts.copy_local:
+ # just return the name of the local file - don't make a
+ # copy currently
+ path = urllib.url2pathname(path)
+ if host:
+ path = os.path.normpath('//' + host + path)
+ if not os.path.exists(path):
+ err = URLGrabError(2,
+ _('Local file does not exist: %s') % (path, ))
+ err.url = url
+ raise err
+ elif not os.path.isfile(path):
+ err = URLGrabError(3,
+ _('Not a normal file: %s') % (path, ))
+ err.url = url
+ raise err
+
+ elif not opts.range:
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = path
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ return path
+
+ def retryfunc(opts, url, filename):
+ fo = PyCurlFileObject(url, filename, opts)
+ try:
+ fo._do_grab()
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = filename
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return filename
+
+ return self._retry(opts, retryfunc, url, filename)
+
+ def urlread(self, url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
+ into memory, but don't use too much'
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ if limit is not None:
+ limit = limit + 1
+
+ def retryfunc(opts, url, limit):
+ fo = PyCurlFileObject(url, filename=None, opts=opts)
+ s = ''
+ try:
+ # this is an unfortunate thing. Some file-like objects
+ # have a default "limit" of None, while the built-in (real)
+ # file objects have -1. They each break the other, so for
+ # now, we just force the default if necessary.
+ if limit is None: s = fo.read()
+ else: s = fo.read(limit)
+
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.data = s
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return s
+
+ s = self._retry(opts, retryfunc, url, limit)
+ if limit and len(s) > limit:
+ err = URLGrabError(8,
+ _('Exceeded limit (%i): %s') % (limit, url))
+ err.url = url
+ raise err
+
+ return s
+
+ def _make_callback(self, callback_obj):
+ if callable(callback_obj):
+ return callback_obj, (), {}
+ else:
+ return callback_obj
+
+# create the default URLGrabber used by urlXXX functions.
+# NOTE: actual defaults are set in URLGrabberOptions
+default_grabber = URLGrabber()
+
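+# A short illustrative sketch (URLs are placeholders): the module-level
+# urlgrab/urlopen/urlread helpers above all delegate to this
+# default_grabber instance, so per-call options can be passed directly:
+#
+#   urlgrab('http://example.com/big.iso', filename='/tmp/big.iso', retry=3)
+#   fo = urlopen('http://example.com/page')
+#   data = urlread('http://example.com/small.txt', limit=65536)
+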
+class URLGrabberFileObject:
+ """This is a file-object wrapper that supports progress objects
+ and throttling.
+
+ This exists to solve the following problem: let's say you want to
+ drop-in replace a normal open with urlopen. You want to use a
+ progress meter and/or throttling, but how do you do that without
+ rewriting your code? Answer: urlopen will return a wrapped file
+ object that does the progress meter and/or throttling internally.
+ """
+
+ def __init__(self, url, filename, opts):
+ self.url = url
+ self.filename = filename
+ self.opts = opts
+ self.fo = None
+ self._rbuf = ''
+ self._rbufsize = 1024*8
+ self._ttime = time.time()
+ self._tsize = 0
+ self._amount_read = 0
+ self._opener = None
+ self._do_open()
+
+ def __getattr__(self, name):
+ """This effectively allows us to wrap at the instance level.
+ Any attribute not found in _this_ object will be searched for
+ in self.fo. This includes methods."""
+ if hasattr(self.fo, name):
+ return getattr(self.fo, name)
+ raise AttributeError, name
+
+ def _get_opener(self):
+ """Build a urllib2 OpenerDirector based on request options."""
+ if self.opts.opener:
+ return self.opts.opener
+ elif self._opener is None:
+ handlers = []
+ need_keepalive_handler = (have_keepalive and self.opts.keepalive)
+ need_range_handler = (range_handlers and \
+ (self.opts.range or self.opts.reget))
+ # if you specify a ProxyHandler when creating the opener
+ # it _must_ come before all other handlers in the list or urllib2
+ # chokes.
+ if self.opts.proxies:
+ handlers.append( _proxy_handler_cache.get(self.opts.proxies) )
+
+ # -------------------------------------------------------
+ # OK, these next few lines are a serious kludge to get
+ # around what I think is a bug in python 2.2's
+ # urllib2. The basic idea is that default handlers
+ # get applied first. If you override one (like a
+ # proxy handler), then the default gets pulled, but
+ # the replacement goes on the end. In the case of
+ # proxies, this means the normal handler picks it up
+ # first and the proxy isn't used. Now, this probably
+ # only happened with ftp or non-keepalive http, so not
+ # many folks saw it. The simple approach to fixing it
+ # is just to make sure you override the other
+ # conflicting defaults as well. I would LOVE to see
+ # these go away or be dealt with more elegantly. The
+ # problem isn't there after 2.2. -MDS 2005/02/24
+ if not need_keepalive_handler:
+ handlers.append( urllib2.HTTPHandler() )
+ if not need_range_handler:
+ handlers.append( urllib2.FTPHandler() )
+ # -------------------------------------------------------
+
+
+ ssl_factory = _ssl_factory_cache.get( (self.opts.ssl_ca_cert,
+ self.opts.ssl_context) )
+ if need_keepalive_handler:
+ handlers.append(keepalive_http_handler)
+ handlers.append(_https_handler_cache.get(ssl_factory))
+ if need_range_handler:
+ handlers.extend( range_handlers )
+ handlers.append( auth_handler )
+ if self.opts.cache_openers:
+ self._opener = _opener_cache.get([ssl_factory,] + handlers)
+ else:
+ self._opener = _opener_cache.create([ssl_factory,] + handlers)
+ # OK, I don't like to do this, but otherwise, we end up with
+ # TWO user-agent headers.
+ self._opener.addheaders = []
+ return self._opener
+
+ def _do_open(self):
+ opener = self._get_opener()
+
+ req = urllib2.Request(self.url, self.opts.data) # build request object
+ self._add_headers(req) # add misc headers that we need
+ self._build_range(req) # take care of reget and byterange stuff
+
+ fo, hdr = self._make_request(req, opener)
+ if self.reget_time and self.opts.reget == 'check_timestamp':
+ # do this if we have a local file with known timestamp AND
+ # we're in check_timestamp reget mode.
+ fetch_again = 0
+ try:
+ modified_tuple = hdr.getdate_tz('last-modified')
+ modified_stamp = rfc822.mktime_tz(modified_tuple)
+ if modified_stamp > self.reget_time: fetch_again = 1
+ except (TypeError,):
+ fetch_again = 1
+
+ if fetch_again:
+ # the server version is newer than the (incomplete) local
+ # version, so we should abandon the version we're getting
+ # and fetch the whole thing again.
+ fo.close()
+ self.opts.reget = None
+ del req.headers['Range']
+ self._build_range(req)
+ fo, hdr = self._make_request(req, opener)
+
+ (scheme, host, path, parm, query, frag) = urlparse.urlparse(self.url)
+ path = urllib.unquote(path)
+ if not (self.opts.progress_obj or self.opts.raw_throttle() \
+ or self.opts.timeout):
+ # if we're not using the progress_obj, throttling, or timeout
+ # we can get a performance boost by going directly to
+ # the underlying fileobject for reads.
+ self.read = fo.read
+ if hasattr(fo, 'readline'):
+ self.readline = fo.readline
+ elif self.opts.progress_obj:
+ try:
+ length = int(hdr['Content-Length'])
+ length = length + self._amount_read # Account for regets
+ except (KeyError, ValueError, TypeError):
+ length = None
+
+ self.opts.progress_obj.start(str(self.filename),
+ urllib.unquote(self.url),
+ os.path.basename(path),
+ length, text=self.opts.text)
+ self.opts.progress_obj.update(0)
+ (self.fo, self.hdr) = (fo, hdr)
+
+ def _add_headers(self, req):
+ if self.opts.user_agent:
+ req.add_header('User-agent', self.opts.user_agent)
+ try: req_type = req.get_type()
+ except ValueError: req_type = None
+ if self.opts.http_headers and req_type in ('http', 'https'):
+ for h, v in self.opts.http_headers:
+ req.add_header(h, v)
+ if self.opts.ftp_headers and req_type == 'ftp':
+ for h, v in self.opts.ftp_headers:
+ req.add_header(h, v)
+
+ def _build_range(self, req):
+ self.reget_time = None
+ self.append = 0
+ reget_length = 0
+ rt = None
+ if have_range and self.opts.reget and type(self.filename) == type(''):
+ # we have reget turned on and we're dumping to a file
+ try:
+ s = os.stat(self.filename)
+ except OSError:
+ pass
+ else:
+ self.reget_time = s[ST_MTIME]
+ reget_length = s[ST_SIZE]
+
+ # Set initial length when regetting
+ self._amount_read = reget_length
+
+ rt = reget_length, ''
+ self.append = 1
+
+ if self.opts.range:
+ if not have_range:
+ err = URLGrabError(10, _('Byte range requested but range '\
+ 'support unavailable %s') % self.url)
+ err.url = self.url
+ raise err
+
+ rt = self.opts.range
+ if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+ if rt:
+ header = range_tuple_to_header(rt)
+ if header: req.add_header('Range', header)
+
+ def _make_request(self, req, opener):
+ try:
+ if have_socket_timeout and self.opts.timeout:
+ old_to = socket.getdefaulttimeout()
+ socket.setdefaulttimeout(self.opts.timeout)
+ try:
+ fo = opener.open(req)
+ finally:
+ socket.setdefaulttimeout(old_to)
+ else:
+ fo = opener.open(req)
+ hdr = fo.info()
+ except ValueError, e:
+ err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
+ err.url = self.url
+ raise err
+
+ except RangeError, e:
+ err = URLGrabError(9, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+ except urllib2.HTTPError, e:
+ new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
+ new_e.code = e.code
+ new_e.exception = e
+ new_e.url = self.url
+ raise new_e
+ except IOError, e:
+ if hasattr(e, 'reason') and have_socket_timeout and \
+ isinstance(e.reason, TimeoutError):
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ else:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except OSError, e:
+ err = URLGrabError(5, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+
+ except HTTPException, e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
+ (e.__class__.__name__, self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ return (fo, hdr)
+
+ def _do_grab(self):
+ """dump the file to self.filename."""
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ new_fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ try:
+ # if we have a known range, only try to read that much.
+ (low, high) = self.opts.range
+ amount = high - low
+ except (TypeError, ValueError):
+ amount = None
+ bs = 1024*8
+ size = 0
+
+ if amount is not None: bs = min(bs, amount - size)
+ block = self.read(bs)
+ size = size + len(block)
+ while block:
+ try:
+ new_fo.write(block)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error writing to local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ if amount is not None: bs = min(bs, amount - size)
+ block = self.read(bs)
+ size = size + len(block)
+
+ new_fo.close()
+ try:
+ modified_tuple = self.hdr.getdate_tz('last-modified')
+ modified_stamp = rfc822.mktime_tz(modified_tuple)
+ os.utime(self.filename, (modified_stamp, modified_stamp))
+ except (TypeError,), e: pass
+
+ return size
+
+ def _fill_buffer(self, amt=None):
+ """fill the buffer to contain at least 'amt' bytes by reading
+ from the underlying file object. If amt is None, then it will
+ read until it gets nothing more. It updates the progress meter
+ and throttles after every self._rbufsize bytes."""
+ # the _rbuf test is only in this first 'if' for speed. It's not
+ # logically necessary
+ if self._rbuf and not amt is None:
+ L = len(self._rbuf)
+ if amt > L:
+ amt = amt - L
+ else:
+ return
+
+ # if we've made it here, then we don't have enough in the buffer
+ # and we need to read more.
+
+ buf = [self._rbuf]
+ bufsize = len(self._rbuf)
+ while amt is None or amt:
+ # first, delay if necessary for throttling reasons
+ if self.opts.raw_throttle():
+ diff = self._tsize/self.opts.raw_throttle() - \
+ (time.time() - self._ttime)
+ if diff > 0: time.sleep(diff)
+ self._ttime = time.time()
+
+ # now read some data, up to self._rbufsize
+ if amt is None: readamount = self._rbufsize
+ else: readamount = min(amt, self._rbufsize)
+ try:
+ new = self.fo.read(readamount)
+ except socket.error, e:
+ err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except TimeoutError, e:
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except IOError, e:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ newsize = len(new)
+ if not newsize: break # no more to read
+
+ if amt: amt = amt - newsize
+ buf.append(new)
+ bufsize = bufsize + newsize
+ self._tsize = newsize
+ self._amount_read = self._amount_read + newsize
+ if self.opts.progress_obj:
+ self.opts.progress_obj.update(self._amount_read)
+
+ self._rbuf = string.join(buf, '')
+ return
+
+ def read(self, amt=None):
+ self._fill_buffer(amt)
+ if amt is None:
+ s, self._rbuf = self._rbuf, ''
+ else:
+ s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
+ return s
+
+ def readline(self, limit=-1):
+ i = string.find(self._rbuf, '\n')
+ while i < 0 and not (0 < limit <= len(self._rbuf)):
+ L = len(self._rbuf)
+ self._fill_buffer(L + self._rbufsize)
+ if not len(self._rbuf) > L: break
+ i = string.find(self._rbuf, '\n', L)
+
+ if i < 0: i = len(self._rbuf)
+ else: i = i+1
+ if 0 <= limit < len(self._rbuf): i = limit
+
+ s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
+ return s
+
+ def close(self):
+ if self.opts.progress_obj:
+ self.opts.progress_obj.end(self._amount_read)
+ self.fo.close()
+ if self.opts.close_connection:
+ try: self.fo.close_connection()
+ except: pass
+
+
+class PyCurlFileObject():
+ def __init__(self, url, filename, opts):
+ self.fo = None
+ self._hdr_dump = ''
+ self._parsed_hdr = None
+ self.url = url
+ self.scheme = urlparse.urlsplit(self.url)[0]
+ self.filename = filename
+ self.append = False
+ self.opts = opts
+ self._complete = False
+ self.reget_time = None
+ self._rbuf = ''
+ self._rbufsize = 1024*8
+ self._ttime = time.time()
+ self._tsize = 0
+ self._amount_read = 0
+ self._reget_length = 0
+ self._prog_running = False
+ self.size = 0
+ self._do_open()
+
+
+ def __getattr__(self, name):
+ """This effectively allows us to wrap at the instance level.
+ Any attribute not found in _this_ object will be searched for
+ in self.fo. This includes methods."""
+
+ if hasattr(self.fo, name):
+ return getattr(self.fo, name)
+ raise AttributeError, name
+
+ def _retrieve(self, buf):
+ if not self._prog_running:
+ if self.opts.progress_obj:
+ size = self.size + self._reget_length
+ self.opts.progress_obj.start(self._prog_reportname,
+ urllib.unquote(self.url),
+ self._prog_basename,
+ size=size,
+ text=self.opts.text)
+ self._prog_running = True
+ self.opts.progress_obj.update(self._amount_read)
+
+ self._amount_read += len(buf)
+ self.fo.write(buf)
+ return len(buf)
+
+ def _hdr_retrieve(self, buf):
+ self._hdr_dump += buf
+ # we have to get the size before we do the progress obj start
+ # but we can't do that w/o making it do 2 connects, which sucks
+ # so we cheat and stuff it in here in the hdr_retrieve
+ if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+ length = buf.split(':')[1]
+ self.size = int(length)
+ elif self.scheme in ['ftp']:
+ s = None
+ if buf.startswith('213 '):
+ s = buf[3:].strip()
+ elif buf.startswith('150 '):
+ s = parse150(buf)
+ if s:
+ self.size = s
+
+ return len(buf)
+
+ def _return_hdr_obj(self):
+ if self._parsed_hdr:
+ return self._parsed_hdr
+ statusend = self._hdr_dump.find('\n')
+ hdrfp = StringIO()
+ hdrfp.write(self._hdr_dump[statusend:])
+ self._parsed_hdr = mimetools.Message(hdrfp)
+ return self._parsed_hdr
+
+ hdr = property(_return_hdr_obj)
+ http_code = property(fget=
+ lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
+
+ def _set_opts(self, opts={}):
+ # XXX
+ if not opts:
+ opts = self.opts
+
+
+ # defaults we're always going to set
+ self.curl_obj.setopt(pycurl.NOPROGRESS, 0)
+ self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
+ self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
+ self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
+ self.curl_obj.setopt(pycurl.FAILONERROR, 1)
+
+ if DEBUG:
+ self.curl_obj.setopt(pycurl.VERBOSE, True)
+ if opts.user_agent:
+ self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+
+ # maybe to be options later
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, 1)
+ self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, 30)
+
+ # timeouts
+ timeout = 300
+ if opts.timeout:
+ timeout = int(opts.timeout)
+ self.curl_obj.setopt(pycurl.TIMEOUT, timeout)
+ # ssl options
+ if self.scheme == 'https':
+ if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
+ self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+
+ #headers:
+ if opts.http_headers and self.scheme in ('http', 'https'):
+ headers = []
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+
+ # ranges:
+ if opts.range or opts.reget:
+ range_str = self._build_range()
+ if range_str:
+ self.curl_obj.setopt(pycurl.RANGE, range_str)
+
+ # throttle/bandwidth
+ if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
+ self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
+
+ # proxy settings
+ if opts.proxies:
+ for (scheme, proxy) in opts.proxies.items():
+ if self.scheme in ('ftp',): # only set the ftp proxy for ftp items
+ if scheme not in ('ftp',):
+ continue
+ else:
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+ elif self.scheme in ('http', 'https'):
+ if scheme not in ('http', 'https'):
+ continue
+ else:
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+
+ # username/password/auth settings
+
+ #posts - simple - expects the fields as they are
+ if opts.data:
+ self.curl_obj.setopt(pycurl.POST, True)
+ self.curl_obj.setopt(pycurl.POSTFIELDS, opts.data)
+
+ # our url
+ self.curl_obj.setopt(pycurl.URL, self.url)
+
+
+ def _do_perform(self):
+ if self._complete:
+ return
+
+ try:
+ self.curl_obj.perform()
+ except pycurl.error, e:
+ # XXX - break some of these out a bit more clearly
+ # to other URLGrabErrors from
+ # http://curl.haxx.se/libcurl/c/libcurl-errors.html
+ # this covers e.args[0] == 22 pretty well - which will be common
+ if str(e.args[1]) == '': # fake it until you make it
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ else:
+ msg = str(e.args[1])
+ err = URLGrabError(14, msg)
+ err.code = self.http_code
+ err.exception = e
+ raise err
+
+ def _do_open(self):
+ self.append = False
+ self.reget_time = None
+ self.curl_obj = _curl_cache
+ self.curl_obj.reset() # reset all old settings away, just in case
+ # setup any ranges
+ self._set_opts()
+ self._do_grab()
+ return self.fo
+
+ def _add_headers(self):
+ pass
+
+ def _build_range(self):
+ self.reget_time = None
+ self.append = False
+ reget_length = 0
+ rt = None
+ if self.opts.reget and type(self.filename) == type(''):
+ # we have reget turned on and we're dumping to a file
+ try:
+ s = os.stat(self.filename)
+ except OSError:
+ pass
+ else:
+ self.reget_time = s[ST_MTIME]
+ reget_length = s[ST_SIZE]
+
+ # Set initial length when regetting
+ self._amount_read = reget_length
+ self._reget_length = reget_length # set where we started from, too
+
+ rt = reget_length, ''
+ self.append = 1
+
+ if self.opts.range:
+ rt = self.opts.range
+ if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+ if rt:
+ header = range_tuple_to_header(rt)
+ if header:
+ return header.split('=')[1]
+
+
+
+ def _make_request(self, req, opener):
+ #XXXX
+ # This doesn't do anything really, but we could use this
+ # instead of do_open() to catch a lot of crap errors as
+ # mstenner did before here
+ return (self.fo, self.hdr)
+
+ try:
+ if have_socket_timeout and self.opts.timeout:
+ old_to = socket.getdefaulttimeout()
+ socket.setdefaulttimeout(self.opts.timeout)
+ try:
+ fo = opener.open(req)
+ finally:
+ socket.setdefaulttimeout(old_to)
+ else:
+ fo = opener.open(req)
+ hdr = fo.info()
+ except ValueError, e:
+ err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
+ err.url = self.url
+ raise err
+
+ except RangeError, e:
+ err = URLGrabError(9, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+ except urllib2.HTTPError, e:
+ new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
+ new_e.code = e.code
+ new_e.exception = e
+ new_e.url = self.url
+ raise new_e
+ except IOError, e:
+ if hasattr(e, 'reason') and have_socket_timeout and \
+ isinstance(e.reason, TimeoutError):
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ else:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except OSError, e:
+ err = URLGrabError(5, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+
+ except HTTPException, e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
+ (e.__class__.__name__, self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ return (fo, hdr)
+
+ def _do_grab(self):
+ """dump the file to a filename or StringIO buffer"""
+
+ if self._complete:
+ return
+
+ if self.filename:
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ self._prog_reportname = 'MEMORY'
+ self._prog_basename = 'MEMORY'
+ fh, self._temp_name = mkstemp()
+ # mkstemp() also returns an open OS-level file descriptor; close it
+ # since we immediately reopen the file by name below.
+ os.close(fh)
+ self.fo = open(self._temp_name, 'wb')
+
+
+ self._do_perform()
+
+
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+
+ if self.filename:
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ os.utime(self.filename, (mod_time, mod_time))
+ # re open it
+ self.fo = open(self.filename, 'r')
+ else:
+ self.fo = open(self._temp_name, 'r')
+
+ self._complete = True
+
+ def _fill_buffer(self, amt=None):
+ """fill the buffer to contain at least 'amt' bytes by reading
+ from the underlying file object. If amt is None, then it will
+ read until it gets nothing more. It updates the progress meter
+ and throttles after every self._rbufsize bytes."""
+ # the _rbuf test is only in this first 'if' for speed. It's not
+ # logically necessary
+ if self._rbuf and not amt is None:
+ L = len(self._rbuf)
+ if amt > L:
+ amt = amt - L
+ else:
+ return
+
+ # if we've made it here, then we don't have enough in the buffer
+ # and we need to read more.
+
+ if not self._complete: self._do_grab() #XXX cheater - change on ranges
+
+ buf = [self._rbuf]
+ bufsize = len(self._rbuf)
+ while amt is None or amt:
+ # first, delay if necessary for throttling reasons
+ if self.opts.raw_throttle():
+ diff = self._tsize/self.opts.raw_throttle() - \
+ (time.time() - self._ttime)
+ if diff > 0: time.sleep(diff)
+ self._ttime = time.time()
+
+ # now read some data, up to self._rbufsize
+ if amt is None: readamount = self._rbufsize
+ else: readamount = min(amt, self._rbufsize)
+ try:
+ new = self.fo.read(readamount)
+ except socket.error, e:
+ err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except TimeoutError, e:
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except IOError, e:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ newsize = len(new)
+ if not newsize: break # no more to read
+
+ if amt: amt = amt - newsize
+ buf.append(new)
+ bufsize = bufsize + newsize
+ self._tsize = newsize
+ self._amount_read = self._amount_read + newsize
+ #if self.opts.progress_obj:
+ # self.opts.progress_obj.update(self._amount_read)
+
+ self._rbuf = string.join(buf, '')
+ return
+
+ def _progress_update(self, download_total, downloaded, upload_total, uploaded):
+ if self._prog_running:
+ downloaded += self._reget_length
+ self.opts.progress_obj.update(downloaded)
+
+ def read(self, amt=None):
+ self._fill_buffer(amt)
+ if amt is None:
+ s, self._rbuf = self._rbuf, ''
+ else:
+ s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
+ return s
+
+ def readline(self, limit=-1):
+ if not self._complete: self._do_grab()
+ return self.fo.readline()
+
+ i = string.find(self._rbuf, '\n')
+ while i < 0 and not (0 < limit <= len(self._rbuf)):
+ L = len(self._rbuf)
+ self._fill_buffer(L + self._rbufsize)
+ if not len(self._rbuf) > L: break
+ i = string.find(self._rbuf, '\n', L)
+
+ if i < 0: i = len(self._rbuf)
+ else: i = i+1
+ if 0 <= limit < len(self._rbuf): i = limit
+
+ s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
+ return s
+
+ def close(self):
+ if self._prog_running:
+ self.opts.progress_obj.end(self._amount_read)
+ self.fo.close()
+
+ # XXX - confident that this does nothing for pycurl
+ #if self.opts.close_connection:
+ # try: self.fo.close_connection()
+ # except: pass
+
+
+
+#####################################################################
+
+
+
+class NoDefault: pass
+class ObjectCache:
+ def __init__(self, name=None):
+ self.name = name or self.__class__.__name__
+ self._lock = thread.allocate_lock()
+ self._cache = []
+
+ def lock(self):
+ self._lock.acquire()
+
+ def unlock(self):
+ self._lock.release()
+
+ def get(self, key, create=None, found=None):
+ for (k, v) in self._cache:
+ if k == key:
+ if DEBUG:
+ DEBUG.debug('%s: found key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, v))
+ found = found or getattr(self, 'found', None)
+ if found: v = found(key, v)
+ return v
+ if DEBUG:
+ DEBUG.debug('%s: no key found' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ create = create or getattr(self, 'create', None)
+ if create:
+ value = create(key)
+ if DEBUG:
+ DEBUG.info('%s: new value created' % self.name)
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+ return value
+ else:
+ raise KeyError('key not found: %s' % key)
+
+ def set(self, key, value):
+ if DEBUG:
+ DEBUG.info('%s: inserting key' % self.name)
+ DEBUG.debug('%s: key = %s' % (self.name, key))
+ DEBUG.debug('%s: val = %s' % (self.name, value))
+ self._cache.append( (key, value) )
+
+ def ts_get(self, key, create=None, found=None):
+ self._lock.acquire()
+ try:
+ return self.get(key, create, found)
+ finally:
+ self._lock.release()
+
+ def ts_set(self, key, value):
+ self._lock.acquire()
+ try:
+ self.set(key, value)
+ finally:
+ self._lock.release()
+
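+# A small illustrative sketch of the ObjectCache protocol (the key and
+# build_value are examples only): get() looks up a key and, on a miss,
+# calls the 'create' callable (the argument or a subclass method) to
+# build and cache the value:
+#
+#   cache = ObjectCache('example')
+#   value = cache.get('some-key', create=lambda key: build_value(key))
+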
+class OpenerCache(ObjectCache):
+ def found(self, factory_and_handlers, opener):
+ for handler in factory_and_handlers[1:]:
+ handler.add_parent(opener)
+ return opener
+ def create(self, factory_and_handlers):
+ factory = factory_and_handlers[0]
+ handlers = factory_and_handlers[1:]
+ return factory.create_opener(*handlers)
+_opener_cache = OpenerCache()
+
+_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+
+class ProxyHandlerCache(ObjectCache):
+ def create(self, proxies):
+ for k, v in proxies.items():
+ utype, url = urllib.splittype(v)
+ host, other = urllib.splithost(url)
+ if (utype is None) or (host is None):
+ err = URLGrabError(13, _('Bad proxy URL: %s') % v)
+ err.url = url
+ raise err
+ return urllib2.ProxyHandler(proxies)
+_proxy_handler_cache = ProxyHandlerCache()
+
+class HTTPSHandlerCache(ObjectCache):
+ def create(self, ssl_factory):
+ return HTTPSHandler(ssl_factory)
+_https_handler_cache = HTTPSHandlerCache()
+
+class SSLFactoryCache(ObjectCache):
+ def create(self, cert_and_context):
+ return sslfactory.get_factory(*cert_and_context)
+_ssl_factory_cache = SSLFactoryCache()
+
+#####################################################################
+# DEPRECATED FUNCTIONS
+def set_throttle(new_throttle):
+ """Deprecated. Use: default_grabber.throttle = new_throttle"""
+ default_grabber.throttle = new_throttle
+
+def set_bandwidth(new_bandwidth):
+ """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
+ default_grabber.bandwidth = new_bandwidth
+
+def set_progress_obj(new_progress_obj):
+ """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
+ default_grabber.progress_obj = new_progress_obj
+
+def set_user_agent(new_user_agent):
+ """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
+ default_grabber.user_agent = new_user_agent
+
+def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+ progress_obj=None, throttle=None, bandwidth=None,
+ numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
+ """Deprecated. Use: urlgrab() with the retry arg instead"""
+ kwargs = {'copy_local' : copy_local,
+ 'close_connection' : close_connection,
+ 'progress_obj' : progress_obj,
+ 'throttle' : throttle,
+ 'bandwidth' : bandwidth,
+ 'retry' : numtries,
+ 'retrycodes' : retrycodes,
+ 'checkfunc' : checkfunc
+ }
+ return urlgrab(url, filename, **kwargs)
+
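+# For example (the URL and counts are placeholders), instead of
+#   retrygrab('http://example.com/f', numtries=5)
+# new code would call
+#   urlgrab('http://example.com/f', retry=5)
+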
+
+#####################################################################
+# TESTING
+def _main_test():
+ import sys
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ set_throttle(1.0)
+ set_bandwidth(32 * 1024)
+ print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
+ default_grabber.bandwidth)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ try: name = apply(urlgrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+
+def _retry_test():
+ import sys
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ def cfunc(filename, hello, there='foo'):
+ print hello, there
+ import random
+ rnum = random.random()
+ if rnum < .5:
+ print 'forcing retry'
+ raise URLGrabError(-1, 'forcing retry')
+ if rnum < .75:
+ print 'forcing failure'
+ raise URLGrabError(-2, 'forcing immediate failure')
+ print 'success'
+ return
+
+ kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
+ try: name = apply(retrygrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+def _file_object_test(filename=None):
+ import random, cStringIO, sys
+ if filename is None:
+ filename = __file__
+ print 'using file "%s" for comparisons' % filename
+ fo = open(filename)
+ s_input = fo.read()
+ fo.close()
+
+ for testfunc in [_test_file_object_smallread,
+ _test_file_object_readall,
+ _test_file_object_readline,
+ _test_file_object_readlines]:
+ fo_input = cStringIO.StringIO(s_input)
+ fo_output = cStringIO.StringIO()
+ wrapper = PyCurlFileObject(fo_input, None, 0)
+ print 'testing %-30s ' % testfunc.__name__,
+ testfunc(wrapper, fo_output)
+ s_output = fo_output.getvalue()
+ if s_output == s_input: print 'passed'
+ else: print 'FAILED'
+
+def _test_file_object_smallread(wrapper, fo_output):
+ while 1:
+ s = wrapper.read(23)
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readall(wrapper, fo_output):
+ s = wrapper.read()
+ fo_output.write(s)
+
+def _test_file_object_readline(wrapper, fo_output):
+ while 1:
+ s = wrapper.readline()
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readlines(wrapper, fo_output):
+ li = wrapper.readlines()
+ fo_output.write(string.join(li, ''))
+
+if __name__ == '__main__':
+ _main_test()
+ _retry_test()
+ _file_object_test('test')
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
+
+>>> import urllib2
+>>> from keepalive import HTTPHandler
+>>> keepalive_handler = HTTPHandler()
+>>> opener = urllib2.build_opener(keepalive_handler)
+>>> urllib2.install_opener(opener)
+>>>
+>>> fo = urllib2.urlopen('http://www.python.org')
+
+If a connection to a given host is requested, and all of the existing
+connections are still in use, another connection will be opened. If
+the handler tries to use an existing connection but it fails in some
+way, it will be closed and removed from the pool.
+
+To remove the handler, simply re-run build_opener with no arguments, and
+install that opener.
+
+You can explicitly close connections by using the close_connection()
+method of the returned file-like object (described below) or you can
+use the handler methods:
+
+ close_connection(host)
+ close_all()
+ open_connections()
+
+NOTE: using the close_connection and close_all methods of the handler
+should be done with care when using multiple threads.
+ * there is nothing that prevents another thread from creating new
+ connections immediately after connections are closed
+ * no checks are done to prevent in-use connections from being closed
+
+>>> keepalive_handler.close_all()
+
+EXTRA ATTRIBUTES AND METHODS
+
+ Upon a status of 200, the object returned has a few additional
+ attributes and methods, which should not be used if you want to
+ remain consistent with the normal urllib2-returned objects:
+
+ close_connection() - close the connection to the host
+ readlines() - you know, readlines()
+ status - the return status (i.e. 404)
+ reason - English translation of status (i.e. 'File not found')
+
+ If you want the best of both worlds, use this inside an
+ AttributeError-catching try:
+
+ >>> try: status = fo.status
+ >>> except AttributeError: status = None
+
+ Unfortunately, these are ONLY there if status == 200, so it's not
+ easy to distinguish between non-200 responses. The reason is that
+ urllib2 tries to do clever things with error codes 301, 302, 401,
+ and 407, and it wraps the object upon return.
+
+ For python versions earlier than 2.4, you can avoid this fancy error
+ handling by setting the module-level global HANDLE_ERRORS to zero.
+ You see, prior to 2.4, it's the HTTP Handler's job to determine what
+ to handle specially, and what to just pass up. HANDLE_ERRORS == 0
+ means "pass everything up". In python 2.4, however, this job no
+ longer belongs to the HTTP Handler and is now done by a NEW handler,
+ HTTPErrorProcessor. Here's the bottom line:
+
+ python version < 2.4
+ HANDLE_ERRORS == 1 (default) pass up 200, treat the rest as
+ errors
+ HANDLE_ERRORS == 0 pass everything up, error processing is
+ left to the calling code
+ python version >= 2.4
+ HANDLE_ERRORS == 1 pass up 200, treat the rest as errors
+ HANDLE_ERRORS == 0 (default) pass everything up, let the
+ other handlers (specifically,
+ HTTPErrorProcessor) decide what to do
+
+ In practice, setting the variable either way makes little difference
+ in python 2.4, so for the most consistent behavior across versions,
+ you probably just want to use the defaults, which will give you
+ exceptions on errors.
+
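+    For illustration only, here is a minimal sketch of toggling the
+    module-level flag before installing the handler (the URL is just a
+    placeholder):
+
+    >>> import urllib2
+    >>> import keepalive
+    >>> keepalive.HANDLE_ERRORS = 0   # let non-200 responses reach the caller
+    >>> opener = urllib2.build_opener(keepalive.HTTPHandler())
+    >>> urllib2.install_opener(opener)
+    >>> fo = urllib2.urlopen('http://www.python.org/')
+    >>> status = getattr(fo, 'status', None)
+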
+"""
+
+# $Id: keepalive.py,v 1.17 2006/12/08 00:14:16 mstenner Exp $
+
+import urllib2
+import httplib
+import socket
+import thread
+
+DEBUG = None
+
+import sslfactory
+
+import sys
+if sys.version_info < (2, 4): HANDLE_ERRORS = 1
+else: HANDLE_ERRORS = 0
+
+class ConnectionManager:
+ """
+ The connection manager must be able to:
+      * keep track of all existing connections
+ """
+ def __init__(self):
+ self._lock = thread.allocate_lock()
+ self._hostmap = {} # map hosts to a list of connections
+ self._connmap = {} # map connections to host
+ self._readymap = {} # map connection to ready state
+
+ def add(self, host, connection, ready):
+ self._lock.acquire()
+ try:
+ if not self._hostmap.has_key(host): self._hostmap[host] = []
+ self._hostmap[host].append(connection)
+ self._connmap[connection] = host
+ self._readymap[connection] = ready
+ finally:
+ self._lock.release()
+
+ def remove(self, connection):
+ self._lock.acquire()
+ try:
+ try:
+ host = self._connmap[connection]
+ except KeyError:
+ pass
+ else:
+ del self._connmap[connection]
+ del self._readymap[connection]
+ self._hostmap[host].remove(connection)
+ if not self._hostmap[host]: del self._hostmap[host]
+ finally:
+ self._lock.release()
+
+ def set_ready(self, connection, ready):
+ try: self._readymap[connection] = ready
+ except KeyError: pass
+
+ def get_ready_conn(self, host):
+ conn = None
+ self._lock.acquire()
+ try:
+ if self._hostmap.has_key(host):
+ for c in self._hostmap[host]:
+ if self._readymap[c]:
+ self._readymap[c] = 0
+ conn = c
+ break
+ finally:
+ self._lock.release()
+ return conn
+
+ def get_all(self, host=None):
+ if host:
+ return list(self._hostmap.get(host, []))
+ else:
+ return dict(self._hostmap)
+
+class KeepAliveHandler:
+ def __init__(self):
+ self._cm = ConnectionManager()
+
+ #### Connection Management
+ def open_connections(self):
+ """return a list of connected hosts and the number of connections
+ to each. [('foo.com:80', 2), ('bar.org', 1)]"""
+ return [(host, len(li)) for (host, li) in self._cm.get_all().items()]
+
+ def close_connection(self, host):
+ """close connection(s) to <host>
+ host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
+ no error occurs if there is no connection to that host."""
+ for h in self._cm.get_all(host):
+ self._cm.remove(h)
+ h.close()
+
+ def close_all(self):
+ """close all open connections"""
+ for host, conns in self._cm.get_all().items():
+ for h in conns:
+ self._cm.remove(h)
+ h.close()
+
+ def _request_closed(self, request, host, connection):
+ """tells us that this request is now closed and the the
+ connection is ready for another request"""
+ self._cm.set_ready(connection, 1)
+
+ def _remove_connection(self, host, connection, close=0):
+ if close: connection.close()
+ self._cm.remove(connection)
+
+ #### Transaction Execution
+ def do_open(self, req):
+ host = req.get_host()
+ if not host:
+ raise urllib2.URLError('no host given')
+
+ try:
+ h = self._cm.get_ready_conn(host)
+ while h:
+ r = self._reuse_connection(h, req, host)
+
+ # if this response is non-None, then it worked and we're
+ # done. Break out, skipping the else block.
+ if r: break
+
+ # connection is bad - possibly closed by server
+ # discard it and ask for the next free connection
+ h.close()
+ self._cm.remove(h)
+ h = self._cm.get_ready_conn(host)
+ else:
+ # no (working) free connections were found. Create a new one.
+ h = self._get_connection(host)
+ if DEBUG: DEBUG.info("creating new connection to %s (%d)",
+ host, id(h))
+ self._cm.add(host, h, 0)
+ self._start_transaction(h, req)
+ r = h.getresponse()
+ except (socket.error, httplib.HTTPException), err:
+ raise urllib2.URLError(err)
+
+ if DEBUG: DEBUG.info("STATUS: %s, %s", r.status, r.reason)
+
+ # if not a persistent connection, don't try to reuse it
+ if r.will_close:
+ if DEBUG: DEBUG.info('server will close connection, discarding')
+ self._cm.remove(h)
+
+ r._handler = self
+ r._host = host
+ r._url = req.get_full_url()
+ r._connection = h
+ r.code = r.status
+ r.headers = r.msg
+ r.msg = r.reason
+
+ if r.status == 200 or not HANDLE_ERRORS:
+ return r
+ else:
+ return self.parent.error('http', req, r,
+ r.status, r.msg, r.headers)
+
+ def _reuse_connection(self, h, req, host):
+ """start the transaction with a re-used connection
+ return a response object (r) upon success or None on failure.
+        This does NOT close or remove bad connections in cases where
+ it returns. However, if an unexpected exception occurs, it
+ will close and remove the connection before re-raising.
+ """
+ try:
+ self._start_transaction(h, req)
+ r = h.getresponse()
+ # note: just because we got something back doesn't mean it
+ # worked. We'll check the version below, too.
+ except (socket.error, httplib.HTTPException):
+ r = None
+ except:
+ # adding this block just in case we've missed
+            # something. We will still raise the exception, but
+ # lets try and close the connection and remove it
+ # first. We previously got into a nasty loop
+ # where an exception was uncaught, and so the
+ # connection stayed open. On the next try, the
+ # same exception was raised, etc. The tradeoff is
+ # that it's now possible this call will raise
+ # a DIFFERENT exception
+ if DEBUG: DEBUG.error("unexpected exception - closing " + \
+ "connection to %s (%d)", host, id(h))
+ self._cm.remove(h)
+ h.close()
+ raise
+
+ if r is None or r.version == 9:
+ # httplib falls back to assuming HTTP 0.9 if it gets a
+ # bad header back. This is most likely to happen if
+ # the socket has been closed by the server since we
+ # last used the connection.
+ if DEBUG: DEBUG.info("failed to re-use connection to %s (%d)",
+ host, id(h))
+ r = None
+ else:
+ if DEBUG: DEBUG.info("re-using connection to %s (%d)", host, id(h))
+
+ return r
+
+ def _start_transaction(self, h, req):
+ try:
+ if req.has_data():
+ data = req.get_data()
+ h.putrequest('POST', req.get_selector())
+ if not req.headers.has_key('Content-type'):
+ h.putheader('Content-type',
+ 'application/x-www-form-urlencoded')
+ if not req.headers.has_key('Content-length'):
+ h.putheader('Content-length', '%d' % len(data))
+ else:
+ h.putrequest('GET', req.get_selector())
+ except (socket.error, httplib.HTTPException), err:
+ raise urllib2.URLError(err)
+
+ for args in self.parent.addheaders:
+ h.putheader(*args)
+ for k, v in req.headers.items():
+ h.putheader(k, v)
+ h.endheaders()
+ if req.has_data():
+ h.send(data)
+
+ def _get_connection(self, host):
+        raise NotImplementedError()
+
+class HTTPHandler(KeepAliveHandler, urllib2.HTTPHandler):
+ def __init__(self):
+ KeepAliveHandler.__init__(self)
+
+ def http_open(self, req):
+ return self.do_open(req)
+
+ def _get_connection(self, host):
+ return HTTPConnection(host)
+
+class HTTPSHandler(KeepAliveHandler, urllib2.HTTPSHandler):
+ def __init__(self, ssl_factory=None):
+ KeepAliveHandler.__init__(self)
+ if not ssl_factory:
+ ssl_factory = sslfactory.get_factory()
+ self._ssl_factory = ssl_factory
+
+ def https_open(self, req):
+ return self.do_open(req)
+
+ def _get_connection(self, host):
+ try: return self._ssl_factory.get_https_connection(host)
+ except AttributeError: return HTTPSConnection(host)
+
+class HTTPResponse(httplib.HTTPResponse):
+ # we need to subclass HTTPResponse in order to
+ # 1) add readline() and readlines() methods
+ # 2) add close_connection() methods
+ # 3) add info() and geturl() methods
+
+ # in order to add readline(), read must be modified to deal with a
+ # buffer. example: readline must read a buffer and then spit back
+ # one line at a time. The only real alternative is to read one
+ # BYTE at a time (ick). Once something has been read, it can't be
+ # put back (ok, maybe it can, but that's even uglier than this),
+ # so if you THEN do a normal read, you must first take stuff from
+ # the buffer.
+
+    # the read method wraps the original to accommodate buffering,
+ # although read() never adds to the buffer.
+ # Both readline and readlines have been stolen with almost no
+ # modification from socket.py
+
+
+ def __init__(self, sock, debuglevel=0, strict=0, method=None):
+ if method: # the httplib in python 2.3 uses the method arg
+ httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
+ else: # 2.2 doesn't
+ httplib.HTTPResponse.__init__(self, sock, debuglevel)
+ self.fileno = sock.fileno
+ self.code = None
+ self._rbuf = ''
+ self._rbufsize = 8096
+ self._handler = None # inserted by the handler later
+ self._host = None # (same)
+ self._url = None # (same)
+ self._connection = None # (same)
+
+ _raw_read = httplib.HTTPResponse.read
+
+ def close(self):
+ if self.fp:
+ self.fp.close()
+ self.fp = None
+ if self._handler:
+ self._handler._request_closed(self, self._host,
+ self._connection)
+
+ def close_connection(self):
+ self._handler._remove_connection(self._host, self._connection, close=1)
+ self.close()
+
+ def info(self):
+ return self.headers
+
+ def geturl(self):
+ return self._url
+
+ def read(self, amt=None):
+ # the _rbuf test is only in this first if for speed. It's not
+ # logically necessary
+ if self._rbuf and not amt is None:
+ L = len(self._rbuf)
+ if amt > L:
+ amt -= L
+ else:
+ s = self._rbuf[:amt]
+ self._rbuf = self._rbuf[amt:]
+ return s
+
+ s = self._rbuf + self._raw_read(amt)
+ self._rbuf = ''
+ return s
+
+ def readline(self, limit=-1):
+ data = ""
+ i = self._rbuf.find('\n')
+ while i < 0 and not (0 < limit <= len(self._rbuf)):
+ new = self._raw_read(self._rbufsize)
+ if not new: break
+ i = new.find('\n')
+ if i >= 0: i = i + len(self._rbuf)
+ self._rbuf = self._rbuf + new
+ if i < 0: i = len(self._rbuf)
+ else: i = i+1
+ if 0 <= limit < len(self._rbuf): i = limit
+ data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
+ return data
+
+ def readlines(self, sizehint = 0):
+ total = 0
+ list = []
+ while 1:
+ line = self.readline()
+ if not line: break
+ list.append(line)
+ total += len(line)
+ if sizehint and total >= sizehint:
+ break
+ return list
+
+
+class HTTPConnection(httplib.HTTPConnection):
+ # use the modified response class
+ response_class = HTTPResponse
+
+class HTTPSConnection(httplib.HTTPSConnection):
+ response_class = HTTPResponse
+
+#########################################################################
+##### TEST FUNCTIONS
+#########################################################################
+
+def error_handler(url):
+ global HANDLE_ERRORS
+ orig = HANDLE_ERRORS
+ keepalive_handler = HTTPHandler()
+ opener = urllib2.build_opener(keepalive_handler)
+ urllib2.install_opener(opener)
+ pos = {0: 'off', 1: 'on'}
+ for i in (0, 1):
+ print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
+ HANDLE_ERRORS = i
+ try:
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ try: status, reason = fo.status, fo.reason
+ except AttributeError: status, reason = None, None
+ except IOError, e:
+ print " EXCEPTION: %s" % e
+ raise
+ else:
+ print " status = %s, reason = %s" % (status, reason)
+ HANDLE_ERRORS = orig
+ hosts = keepalive_handler.open_connections()
+ print "open connections:", hosts
+ keepalive_handler.close_all()
+
+def continuity(url):
+ import md5
+ format = '%25s: %s'
+
+ # first fetch the file with the normal http handler
+ opener = urllib2.build_opener()
+ urllib2.install_opener(opener)
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ m = md5.new(foo)
+ print format % ('normal urllib', m.hexdigest())
+
+ # now install the keepalive handler and try again
+ opener = urllib2.build_opener(HTTPHandler())
+ urllib2.install_opener(opener)
+
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ m = md5.new(foo)
+ print format % ('keepalive read', m.hexdigest())
+
+ fo = urllib2.urlopen(url)
+ foo = ''
+ while 1:
+ f = fo.readline()
+ if f: foo = foo + f
+ else: break
+ fo.close()
+ m = md5.new(foo)
+ print format % ('keepalive readline', m.hexdigest())
+
+def comp(N, url):
+ print ' making %i connections to:\n %s' % (N, url)
+
+ sys.stdout.write(' first using the normal urllib handlers')
+ # first use normal opener
+ opener = urllib2.build_opener()
+ urllib2.install_opener(opener)
+ t1 = fetch(N, url)
+ print ' TIME: %.3f s' % t1
+
+ sys.stdout.write(' now using the keepalive handler ')
+ # now install the keepalive handler and try again
+ opener = urllib2.build_opener(HTTPHandler())
+ urllib2.install_opener(opener)
+ t2 = fetch(N, url)
+ print ' TIME: %.3f s' % t2
+ print ' improvement factor: %.2f' % (t1/t2, )
+
+def fetch(N, url, delay=0):
+ import time
+ lens = []
+ starttime = time.time()
+ for i in range(N):
+ if delay and i > 0: time.sleep(delay)
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ lens.append(len(foo))
+ diff = time.time() - starttime
+
+ j = 0
+ for i in lens[1:]:
+ j = j + 1
+ if not i == lens[0]:
+ print "WARNING: inconsistent length on read %i: %i" % (j, i)
+
+ return diff
+
+def test_timeout(url):
+ global DEBUG
+ dbbackup = DEBUG
+ class FakeLogger:
+ def debug(self, msg, *args): print msg % args
+ info = warning = error = debug
+ DEBUG = FakeLogger()
+ print " fetching the file to establish a connection"
+ fo = urllib2.urlopen(url)
+ data1 = fo.read()
+ fo.close()
+
+ i = 20
+ print " waiting %i seconds for the server to close the connection" % i
+ while i > 0:
+ sys.stdout.write('\r %2i' % i)
+ sys.stdout.flush()
+ time.sleep(1)
+ i -= 1
+ sys.stderr.write('\r')
+
+ print " fetching the file a second time"
+ fo = urllib2.urlopen(url)
+ data2 = fo.read()
+ fo.close()
+
+ if data1 == data2:
+ print ' data are identical'
+ else:
+ print ' ERROR: DATA DIFFER'
+
+ DEBUG = dbbackup
+
+
+def test(url, N=10):
+ print "checking error hander (do this on a non-200)"
+ try: error_handler(url)
+ except IOError, e:
+ print "exiting - exception will prevent further tests"
+ sys.exit()
+ print
+ print "performing continuity test (making sure stuff isn't corrupted)"
+ continuity(url)
+ print
+ print "performing speed comparison"
+ comp(N, url)
+ print
+ print "performing dropped-connection check"
+ test_timeout(url)
+
+if __name__ == '__main__':
+ import time
+ import sys
+ try:
+ N = int(sys.argv[1])
+ url = sys.argv[2]
+ except:
+ print "%s <integer> <url>" % sys.argv[0]
+ else:
+ test(url, N)
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+"""Module for downloading files from a pool of mirrors
+
+DESCRIPTION
+
+ This module provides support for downloading files from a pool of
+ mirrors with configurable failover policies. To a large extent, the
+ failover policy is chosen by using different classes derived from
+ the main class, MirrorGroup.
+
+ Instances of MirrorGroup (and cousins) act very much like URLGrabber
+ instances in that they have urlread, urlgrab, and urlopen methods.
+  They can therefore be used in very similar ways.
+
+ from urlgrabber.grabber import URLGrabber
+ from urlgrabber.mirror import MirrorGroup
+ gr = URLGrabber()
+  mg = MirrorGroup(gr, ['http://foo.com/some/directory/',
+                        'http://bar.org/maybe/somewhere/else/',
+                        'ftp://baz.net/some/other/place/entirely/'])
+  mg.urlgrab('relative/path.zip')
+
+ The assumption is that all mirrors are identical AFTER the base urls
+ specified, so that any mirror can be used to fetch any file.
+
+FAILOVER
+
+ The failover mechanism is designed to be customized by subclassing
+ from MirrorGroup to change the details of the behavior. In general,
+ the classes maintain a master mirror list and a "current mirror"
+ index. When a download is initiated, a copy of this list and index
+ is created for that download only. The specific failover policy
+ depends on the class used, and so is documented in the class
+ documentation. Note that ANY behavior of the class can be
+ overridden, so any failover policy at all is possible (although
+ you may need to change the interface in extreme cases).
+
+CUSTOMIZATION
+
+ Most customization of a MirrorGroup object is done at instantiation
+ time (or via subclassing). There are four major types of
+  customization (a combined sketch follows this list):
+
+ 1) Pass in a custom urlgrabber - The passed in urlgrabber will be
+ used (by default... see #2) for the grabs, so options to it
+ apply for the url-fetching
+
+ 2) Custom mirror list - Mirror lists can simply be a list of
+       strings (as shown in the example above) but each can
+ also be a dict, allowing for more options. For example, the
+ first mirror in the list above could also have been:
+
+ {'mirror': 'http://foo.com/some/directory/',
+ 'grabber': <a custom grabber to be used for this mirror>,
+ 'kwargs': { <a dict of arguments passed to the grabber> }}
+
+ All mirrors are converted to this format internally. If
+ 'grabber' is omitted, the default grabber will be used. If
+ kwargs are omitted, then (duh) they will not be used.
+
+ 3) Pass keyword arguments when instantiating the mirror group.
+ See, for example, the failure_callback argument.
+
+ 4) Finally, any kwargs passed in for the specific file (to the
+ urlgrab method, for example) will be folded in. The options
+ passed into the grabber's urlXXX methods will override any
+ options specified in a custom mirror dict.
+
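+  As an illustrative sketch pulling the options above together (the host
+  names and the callback are placeholders, not part of the API):
+
+    from urlgrabber.grabber import URLGrabber
+    from urlgrabber.mirror import MirrorGroup
+
+    def log_failure(cb_obj):
+        # cb_obj.url, .mirror and .exception are filled in by MirrorGroup
+        print 'failed %s: %s' % (cb_obj.url, cb_obj.exception)
+
+    mg = MirrorGroup(URLGrabber(),
+                     [{'mirror': 'http://mirror-a.example.org/pub/'},
+                      'http://mirror-b.example.org/pub/'],
+                     failure_callback=log_failure,
+                     default_action={'remove': 1})
+    mg.urlgrab('relative/path.zip', filename='path.zip')
+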
+"""
+
+# $Id: mirror.py,v 1.14 2006/02/22 18:26:46 mstenner Exp $
+
+import random
+import thread # needed for locking to make this threadsafe
+
+from grabber import URLGrabError, CallbackObject, DEBUG
+
+try:
+ from i18n import _
+except ImportError, msg:
+ def _(st): return st
+
+class GrabRequest:
+ """This is a dummy class used to hold information about the specific
+ request. For example, a single file. By maintaining this information
+ separately, we can accomplish two things:
+
+ 1) make it a little easier to be threadsafe
+ 2) have request-specific parameters
+ """
+ pass
+
+class MirrorGroup:
+ """Base Mirror class
+
+ Instances of this class are built with a grabber object and a list
+ of mirrors. Then all calls to urlXXX should be passed relative urls.
+ The requested file will be searched for on the first mirror. If the
+ grabber raises an exception (possibly after some retries) then that
+ mirror will be removed from the list, and the next will be attempted.
+ If all mirrors are exhausted, then an exception will be raised.
+
+ MirrorGroup has the following failover policy:
+
+ * downloads begin with the first mirror
+
+ * by default (see default_action below) a failure (after retries)
+ causes it to increment the local AND master indices. Also,
+ the current mirror is removed from the local list (but NOT the
+ master list - the mirror can potentially be used for other
+ files)
+
+ * if the local list is ever exhausted, a URLGrabError will be
+ raised (errno=256, no more mirrors)
+
+ OPTIONS
+
+ In addition to the required arguments "grabber" and "mirrors",
+ MirrorGroup also takes the following optional arguments:
+
+ default_action
+
+ A dict that describes the actions to be taken upon failure
+ (after retries). default_action can contain any of the
+ following keys (shown here with their default values):
+
+ default_action = {'increment': 1,
+ 'increment_master': 1,
+ 'remove': 1,
+ 'remove_master': 0,
+ 'fail': 0}
+
+ In this context, 'increment' means "use the next mirror" and
+ 'remove' means "never use this mirror again". The two
+ 'master' values refer to the instance-level mirror list (used
+ for all files), whereas the non-master values refer to the
+ current download only.
+
+ The 'fail' option will cause immediate failure by re-raising
+ the exception and no further attempts to get the current
+ download.
+
+ This dict can be set at instantiation time,
+ mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
+ at method-execution time (only applies to current fetch),
+ filename = mg.urlgrab(url, default_action={'increment': 0})
+ or by returning an action dict from the failure_callback
+ return {'fail':0}
+ in increasing precedence.
+
+ If all three of these were done, the net result would be:
+ {'increment': 0, # set in method
+ 'increment_master': 1, # class default
+ 'remove': 1, # class default
+ 'remove_master': 0, # class default
+ 'fail': 0} # set at instantiation, reset
+ # from callback
+
+ failure_callback
+
+ this is a callback that will be called when a mirror "fails",
+ meaning the grabber raises some URLGrabError. If this is a
+ tuple, it is interpreted to be of the form (cb, args, kwargs)
+ where cb is the actual callable object (function, method,
+ etc). Otherwise, it is assumed to be the callable object
+ itself. The callback will be passed a grabber.CallbackObject
+ instance along with args and kwargs (if present). The following
+      attributes are defined within the instance:
+
+ obj.exception = < exception that was raised >
+ obj.mirror = < the mirror that was tried >
+ obj.relative_url = < url relative to the mirror >
+ obj.url = < full url that failed >
+ # .url is just the combination of .mirror
+ # and .relative_url
+
+ The failure callback can return an action dict, as described
+ above.
+
+ Like default_action, the failure_callback can be set at
+ instantiation time or when the urlXXX method is called. In
+ the latter case, it applies only for that fetch.
+
+ The callback can re-raise the exception quite easily. For
+ example, this is a perfectly adequate callback function:
+
+ def callback(obj): raise obj.exception
+
+ WARNING: do not save the exception object (or the
+ CallbackObject instance). As they contain stack frame
+ references, they can lead to circular references.
+
+ Notes:
+ * The behavior can be customized by deriving and overriding the
+ 'CONFIGURATION METHODS'
+ * The 'grabber' instance is kept as a reference, not copied.
+ Therefore, the grabber instance can be modified externally
+ and changes will take effect immediately.
+ """
+
+ # notes on thread-safety:
+
+ # A GrabRequest should never be shared by multiple threads because
+ # it's never saved inside the MG object and never returned outside it.
+ # therefore, it should be safe to access/modify grabrequest data
+ # without a lock. However, accessing the mirrors and _next attributes
+ # of the MG itself must be done when locked to prevent (for example)
+ # removal of the wrong mirror.
+
+ ##############################################################
+ # CONFIGURATION METHODS - intended to be overridden to
+ # customize behavior
+ def __init__(self, grabber, mirrors, **kwargs):
+ """Initialize the MirrorGroup object.
+
+ REQUIRED ARGUMENTS
+
+ grabber - URLGrabber instance
+ mirrors - a list of mirrors
+
+ OPTIONAL ARGUMENTS
+
+ failure_callback - callback to be used when a mirror fails
+ default_action - dict of failure actions
+
+ See the module-level and class level documentation for more
+ details.
+ """
+
+ # OVERRIDE IDEAS:
+ # shuffle the list to randomize order
+ self.grabber = grabber
+ self.mirrors = self._parse_mirrors(mirrors)
+ self._next = 0
+ self._lock = thread.allocate_lock()
+ self.default_action = None
+ self._process_kwargs(kwargs)
+
+ # if these values are found in **kwargs passed to one of the urlXXX
+ # methods, they will be stripped before getting passed on to the
+ # grabber
+ options = ['default_action', 'failure_callback']
+
+ def _process_kwargs(self, kwargs):
+ self.failure_callback = kwargs.get('failure_callback')
+ self.default_action = kwargs.get('default_action')
+
+ def _parse_mirrors(self, mirrors):
+ parsed_mirrors = []
+ for m in mirrors:
+ if type(m) == type(''): m = {'mirror': m}
+ parsed_mirrors.append(m)
+ return parsed_mirrors
+
+ def _load_gr(self, gr):
+ # OVERRIDE IDEAS:
+ # shuffle gr list
+ self._lock.acquire()
+ gr.mirrors = list(self.mirrors)
+ gr._next = self._next
+ self._lock.release()
+
+ def _get_mirror(self, gr):
+ # OVERRIDE IDEAS:
+ # return a random mirror so that multiple mirrors get used
+ # even without failures.
+ if not gr.mirrors:
+ raise URLGrabError(256, _('No more mirrors to try.'))
+ return gr.mirrors[gr._next]
+
+ def _failure(self, gr, cb_obj):
+ # OVERRIDE IDEAS:
+ # inspect the error - remove=1 for 404, remove=2 for connection
+ # refused, etc. (this can also be done via
+ # the callback)
+ cb = gr.kw.get('failure_callback') or self.failure_callback
+ if cb:
+ if type(cb) == type( () ):
+ cb, args, kwargs = cb
+ else:
+ args, kwargs = (), {}
+ action = cb(cb_obj, *args, **kwargs) or {}
+ else:
+ action = {}
+ # XXXX - decide - there are two ways to do this
+ # the first is action-overriding as a whole - use the entire action
+ # or fall back on module level defaults
+ #action = action or gr.kw.get('default_action') or self.default_action
+ # the other is to fall through for each element in the action dict
+ a = dict(self.default_action or {})
+ a.update(gr.kw.get('default_action', {}))
+ a.update(action)
+ action = a
+ self.increment_mirror(gr, action)
+ if action and action.get('fail', 0): raise
+
+ def increment_mirror(self, gr, action={}):
+ """Tell the mirror object increment the mirror index
+
+ This increments the mirror index, which amounts to telling the
+ mirror object to use a different mirror (for this and future
+ downloads).
+
+ This is a SEMI-public method. It will be called internally,
+ and you may never need to call it. However, it is provided
+ (and is made public) so that the calling program can increment
+ the mirror choice for methods like urlopen. For example, with
+ urlopen, there's no good way for the mirror group to know that
+ an error occurs mid-download (it's already returned and given
+ you the file object).
+
+ remove --- can have several values
+ 0 do not remove the mirror from the list
+ 1 remove the mirror for this download only
+ 2 remove the mirror permanently
+
+ beware of remove=0 as it can lead to infinite loops
+ """
+ badmirror = gr.mirrors[gr._next]
+
+ self._lock.acquire()
+ try:
+ ind = self.mirrors.index(badmirror)
+ except ValueError:
+ pass
+ else:
+ if action.get('remove_master', 0):
+ del self.mirrors[ind]
+ elif self._next == ind and action.get('increment_master', 1):
+ self._next += 1
+ if self._next >= len(self.mirrors): self._next = 0
+ self._lock.release()
+
+ if action.get('remove', 1):
+ del gr.mirrors[gr._next]
+ elif action.get('increment', 1):
+ gr._next += 1
+ if gr._next >= len(gr.mirrors): gr._next = 0
+
+ if DEBUG:
+ grm = [m['mirror'] for m in gr.mirrors]
+ DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
+ selfm = [m['mirror'] for m in self.mirrors]
+ DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
+
+ #####################################################################
+ # NON-CONFIGURATION METHODS
+ # these methods are designed to be largely workhorse methods that
+ # are not intended to be overridden. That doesn't mean you can't;
+ # if you want to, feel free, but most things can be done by
+ # by overriding the configuration methods :)
+
+ def _join_url(self, base_url, rel_url):
+ if base_url.endswith('/') or rel_url.startswith('/'):
+ return base_url + rel_url
+ else:
+ return base_url + '/' + rel_url
+
+ def _mirror_try(self, func, url, kw):
+ gr = GrabRequest()
+ gr.func = func
+ gr.url = url
+ gr.kw = dict(kw)
+ self._load_gr(gr)
+
+ for k in self.options:
+ try: del kw[k]
+ except KeyError: pass
+
+ while 1:
+ mirrorchoice = self._get_mirror(gr)
+ fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
+ kwargs = dict(mirrorchoice.get('kwargs', {}))
+ kwargs.update(kw)
+ grabber = mirrorchoice.get('grabber') or self.grabber
+ func_ref = getattr(grabber, func)
+ if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+ try:
+ return func_ref( *(fullurl,), **kwargs )
+ except URLGrabError, e:
+ if DEBUG: DEBUG.info('MIRROR: failed')
+ obj = CallbackObject()
+ obj.exception = e
+ obj.mirror = mirrorchoice['mirror']
+ obj.relative_url = gr.url
+ obj.url = fullurl
+ self._failure(gr, obj)
+
+ def urlgrab(self, url, filename=None, **kwargs):
+ kw = dict(kwargs)
+ kw['filename'] = filename
+ func = 'urlgrab'
+ return self._mirror_try(func, url, kw)
+
+ def urlopen(self, url, **kwargs):
+ kw = dict(kwargs)
+ func = 'urlopen'
+ return self._mirror_try(func, url, kw)
+
+ def urlread(self, url, limit=None, **kwargs):
+ kw = dict(kwargs)
+ kw['limit'] = limit
+ func = 'urlread'
+ return self._mirror_try(func, url, kw)
+
+
+class MGRandomStart(MirrorGroup):
+ """A mirror group that starts at a random mirror in the list.
+
+    The behavior of this class is identical to MirrorGroup, except that
+ it starts at a random location in the mirror list.
+ """
+
+ def __init__(self, grabber, mirrors, **kwargs):
+ """Initialize the object
+
+        The arguments for initialization are the same as for MirrorGroup
+ """
+ MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
+ self._next = random.randrange(len(mirrors))
+
+class MGRandomOrder(MirrorGroup):
+ """A mirror group that uses mirrors in a random order.
+
+    The behavior of this class is identical to MirrorGroup, except that
+ it uses the mirrors in a random order. Note that the order is set at
+ initialization time and fixed thereafter. That is, it does not pick a
+ random mirror after each failure.
+ """
+
+ def __init__(self, grabber, mirrors, **kwargs):
+ """Initialize the object
+
+        The arguments for initialization are the same as for MirrorGroup
+ """
+ MirrorGroup.__init__(self, grabber, mirrors, **kwargs)
+ random.shuffle(self.mirrors)
+
+if __name__ == '__main__':
+ pass
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+
+# $Id: progress.py,v 1.7 2005/08/19 21:59:07 mstenner Exp $
+
+import sys
+import time
+import math
+import thread
+import types
+import fcntl
+import struct
+import termios
+
+# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html
+def terminal_width(fd=1):
+ """ Get the real terminal width """
+ try:
+ buf = 'abcdefgh'
+ buf = fcntl.ioctl(fd, termios.TIOCGWINSZ, buf)
+ ret = struct.unpack('hhhh', buf)[1]
+ if ret == 0:
+ return 80
+ # Add minimum too?
+ return ret
+ except: # IOError
+ return 80
+
+_term_width_val = None
+_term_width_last = None
+def terminal_width_cached(fd=1, cache_timeout=1.000):
+ """ Get the real terminal width, but cache it for a bit. """
+ global _term_width_val
+ global _term_width_last
+
+ now = time.time()
+ if _term_width_val is None or (now - _term_width_last) > cache_timeout:
+ _term_width_val = terminal_width(fd)
+ _term_width_last = now
+ return _term_width_val
+
+class TerminalLine:
+ """ Help create dynamic progress bars, uses terminal_width_cached(). """
+
+ def __init__(self, min_rest=0, beg_len=None, fd=1, cache_timeout=1.000):
+ if beg_len is None:
+ beg_len = min_rest
+ self._min_len = min_rest
+ self._llen = terminal_width_cached(fd, cache_timeout)
+ if self._llen < beg_len:
+ self._llen = beg_len
+ self._fin = False
+
+ def __len__(self):
+ """ Usable length for elements. """
+ return self._llen - self._min_len
+
+ def rest_split(self, fixed, elements=2):
+ """ After a fixed length, split the rest of the line length among
+ a number of different elements (default=2). """
+ if self._llen < fixed:
+ return 0
+ return (self._llen - fixed) / elements
+
+ def add(self, element, full_len=None):
+ """ If there is room left in the line, above min_len, add element.
+ Note that as soon as one add fails all the rest will fail too. """
+
+ if full_len is None:
+ full_len = len(element)
+ if len(self) < full_len:
+ self._fin = True
+ if self._fin:
+ return ''
+
+ self._llen -= len(element)
+ return element
+
+ def rest(self):
+ """ Current rest of line, same as .rest_split(fixed=0, elements=1). """
+ return self._llen
+
+class BaseMeter:
+ def __init__(self):
+ self.update_period = 0.3 # seconds
+
+ self.filename = None
+ self.url = None
+ self.basename = None
+ self.text = None
+ self.size = None
+ self.start_time = None
+ self.last_amount_read = 0
+ self.last_update_time = None
+ self.re = RateEstimator()
+
+ def start(self, filename=None, url=None, basename=None,
+ size=None, now=None, text=None):
+ self.filename = filename
+ self.url = url
+ self.basename = basename
+ self.text = text
+
+ #size = None ######### TESTING
+ self.size = size
+ if not size is None: self.fsize = format_number(size) + 'B'
+
+ if now is None: now = time.time()
+ self.start_time = now
+ self.re.start(size, now)
+ self.last_amount_read = 0
+ self.last_update_time = now
+ self._do_start(now)
+
+ def _do_start(self, now=None):
+ pass
+
+ def update(self, amount_read, now=None):
+ # for a real gui, you probably want to override and put a call
+ # to your mainloop iteration function here
+ if now is None: now = time.time()
+ if (now >= self.last_update_time + self.update_period) or \
+ not self.last_update_time:
+ self.re.update(amount_read, now)
+ self.last_amount_read = amount_read
+ self.last_update_time = now
+ self._do_update(amount_read, now)
+
+ def _do_update(self, amount_read, now=None):
+ pass
+
+ def end(self, amount_read, now=None):
+ if now is None: now = time.time()
+ self.re.update(amount_read, now)
+ self.last_amount_read = amount_read
+ self.last_update_time = now
+ self._do_end(amount_read, now)
+
+ def _do_end(self, amount_read, now=None):
+ pass
+
+# This is kind of a hack: progress is reported through grabber, which doesn't
+# know the total size to download, so we pass that data out of band here.
+# This will be "fixed" one way or another soon.
+_text_meter_total_size = 0
+_text_meter_sofar_size = 0
+def text_meter_total_size(size, downloaded=0):
+ global _text_meter_total_size
+ global _text_meter_sofar_size
+ _text_meter_total_size = size
+ _text_meter_sofar_size = downloaded
+
+#
+# update: No size (minimal: 17 chars)
+# -----------------------------------
+# <text> <rate> | <current size> <elapsed time>
+# 8-48 1 8 3 6 1 9 5
+#
+# Order: 1. <text>+<current size> (17)
+# 2. +<elapsed time> (10, total: 27)
+# 3. + ( 5, total: 32)
+# 4. +<rate> ( 9, total: 41)
+#
+# update: Size, Single file
+# -------------------------
+# <text> <pc> <bar> <rate> | <current size> <eta time> ETA
+# 8-25 1 3-4 1 6-16 1 8 3 6 1 9 1 3 1
+#
+# Order: 1. <text>+<current size> (17)
+# 2. +<eta time> (10, total: 27)
+# 3. +ETA ( 5, total: 32)
+# 4. +<pc> ( 4, total: 36)
+# 5. +<rate> ( 9, total: 45)
+# 6. +<bar> ( 7, total: 52)
+#
+# update: Size, All files
+# -----------------------
+# <text> <total pc> <pc> <bar> <rate> | <current size> <eta time> ETA
+# 8-22 1 5-7 1 3-4 1 6-12 1 8 3 6 1 9 1 3 1
+#
+# Order: 1. <text>+<current size> (17)
+# 2. +<eta time> (10, total: 27)
+# 3. +ETA ( 5, total: 32)
+# 4. +<total pc> ( 5, total: 37)
+# 4. +<pc> ( 4, total: 41)
+# 5. +<rate> ( 9, total: 50)
+# 6. +<bar> ( 7, total: 57)
+#
+# end
+# ---
+# <text> | <current size> <elapsed time>
+# 8-56 3 6 1 9 5
+#
+# Order: 1. <text> ( 8)
+# 2. +<current size> ( 9, total: 17)
+# 3. +<elapsed time> (10, total: 27)
+# 4. + ( 5, total: 32)
+#
+
+class TextMeter(BaseMeter):
+ def __init__(self, fo=sys.stderr):
+ BaseMeter.__init__(self)
+ self.fo = fo
+
+ def _do_update(self, amount_read, now=None):
+ etime = self.re.elapsed_time()
+ fetime = format_time(etime)
+ fread = format_number(amount_read)
+ #self.size = None
+ if self.text is not None:
+ text = self.text
+ else:
+ text = self.basename
+
+ ave_dl = format_number(self.re.average_rate())
+ sofar_size = None
+ if _text_meter_total_size:
+ sofar_size = _text_meter_sofar_size + amount_read
+ sofar_pc = (sofar_size * 100) / _text_meter_total_size
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
+ ui_size = tl.add(' | %5sB' % fread)
+ if self.size is None:
+ ui_time = tl.add(' %9s' % fetime)
+ ui_end = tl.add(' ' * 5)
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+ out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_rate, ui_size, ui_time, ui_end)
+ else:
+ rtime = self.re.remaining_time()
+ frtime = format_time(rtime)
+ frac = self.re.fraction_read()
+
+ ui_time = tl.add(' %9s' % frtime)
+ ui_end = tl.add(' ETA ')
+
+ if sofar_size is None:
+ ui_sofar_pc = ''
+ else:
+ ui_sofar_pc = tl.add(' (%i%%)' % sofar_pc,
+ full_len=len(" (100%)"))
+
+ ui_pc = tl.add(' %2i%%' % (frac*100))
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+ bar = '='*int(blen * frac)
+ if (blen * frac) - int(blen * frac) >= 0.5:
+ bar += '-'
+ ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
+ out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_pc, ui_bar,
+ ui_rate, ui_size, ui_time, ui_end)
+
+ self.fo.write(out)
+ self.fo.flush()
+
+ def _do_end(self, amount_read, now=None):
+ global _text_meter_total_size
+ global _text_meter_sofar_size
+
+ total_time = format_time(self.re.elapsed_time())
+ total_size = format_number(amount_read)
+ if self.text is not None:
+ text = self.text
+ else:
+ text = self.basename
+
+ tl = TerminalLine(8)
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %9s' % total_time)
+ not_done = self.size is not None and amount_read != self.size
+ if not_done:
+ ui_end = tl.add(' ... ')
+ else:
+ ui_end = tl.add(' ' * 5)
+
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
+ self.fo.flush()
+
+ # Don't add size to the sofar size until we have all of it.
+ # If we don't have a size, then just pretend/hope we got all of it.
+ if not_done:
+ return
+
+ if _text_meter_total_size:
+ _text_meter_sofar_size += amount_read
+ if _text_meter_total_size <= _text_meter_sofar_size:
+ _text_meter_total_size = 0
+ _text_meter_sofar_size = 0
+
+text_progress_meter = TextMeter
+
+class MultiFileHelper(BaseMeter):
+ def __init__(self, master):
+ BaseMeter.__init__(self)
+ self.master = master
+
+ def _do_start(self, now):
+ self.master.start_meter(self, now)
+
+ def _do_update(self, amount_read, now):
+ # elapsed time since last update
+ self.master.update_meter(self, now)
+
+ def _do_end(self, amount_read, now):
+ self.ftotal_time = format_time(now - self.start_time)
+ self.ftotal_size = format_number(self.last_amount_read)
+ self.master.end_meter(self, now)
+
+ def failure(self, message, now=None):
+ self.master.failure_meter(self, message, now)
+
+ def message(self, message):
+ self.master.message_meter(self, message)
+
+class MultiFileMeter:
+ helperclass = MultiFileHelper
+ def __init__(self):
+ self.meters = []
+ self.in_progress_meters = []
+ self._lock = thread.allocate_lock()
+ self.update_period = 0.3 # seconds
+
+ self.numfiles = None
+ self.finished_files = 0
+ self.failed_files = 0
+ self.open_files = 0
+ self.total_size = None
+ self.failed_size = 0
+ self.start_time = None
+ self.finished_file_size = 0
+ self.last_update_time = None
+ self.re = RateEstimator()
+
+ def start(self, numfiles=None, total_size=None, now=None):
+ if now is None: now = time.time()
+ self.numfiles = numfiles
+ self.finished_files = 0
+ self.failed_files = 0
+ self.open_files = 0
+ self.total_size = total_size
+ self.failed_size = 0
+ self.start_time = now
+ self.finished_file_size = 0
+ self.last_update_time = now
+ self.re.start(total_size, now)
+ self._do_start(now)
+
+ def _do_start(self, now):
+ pass
+
+ def end(self, now=None):
+ if now is None: now = time.time()
+ self._do_end(now)
+
+ def _do_end(self, now):
+ pass
+
+ def lock(self): self._lock.acquire()
+ def unlock(self): self._lock.release()
+
+ ###########################################################
+ # child meter creation and destruction
+ def newMeter(self):
+ newmeter = self.helperclass(self)
+ self.meters.append(newmeter)
+ return newmeter
+
+ def removeMeter(self, meter):
+ self.meters.remove(meter)
+
+ ###########################################################
+ # child functions - these should only be called by helpers
+ def start_meter(self, meter, now):
+ if not meter in self.meters:
+ raise ValueError('attempt to use orphaned meter')
+ self._lock.acquire()
+ try:
+ if not meter in self.in_progress_meters:
+ self.in_progress_meters.append(meter)
+ self.open_files += 1
+ finally:
+ self._lock.release()
+ self._do_start_meter(meter, now)
+
+ def _do_start_meter(self, meter, now):
+ pass
+
+ def update_meter(self, meter, now):
+ if not meter in self.meters:
+ raise ValueError('attempt to use orphaned meter')
+ if (now >= self.last_update_time + self.update_period) or \
+ not self.last_update_time:
+ self.re.update(self._amount_read(), now)
+ self.last_update_time = now
+ self._do_update_meter(meter, now)
+
+ def _do_update_meter(self, meter, now):
+ pass
+
+ def end_meter(self, meter, now):
+ if not meter in self.meters:
+ raise ValueError('attempt to use orphaned meter')
+ self._lock.acquire()
+ try:
+ try: self.in_progress_meters.remove(meter)
+ except ValueError: pass
+ self.open_files -= 1
+ self.finished_files += 1
+ self.finished_file_size += meter.last_amount_read
+ finally:
+ self._lock.release()
+ self._do_end_meter(meter, now)
+
+ def _do_end_meter(self, meter, now):
+ pass
+
+ def failure_meter(self, meter, message, now):
+ if not meter in self.meters:
+ raise ValueError('attempt to use orphaned meter')
+ self._lock.acquire()
+ try:
+ try: self.in_progress_meters.remove(meter)
+ except ValueError: pass
+ self.open_files -= 1
+ self.failed_files += 1
+ if meter.size and self.failed_size is not None:
+ self.failed_size += meter.size
+ else:
+ self.failed_size = None
+ finally:
+ self._lock.release()
+ self._do_failure_meter(meter, message, now)
+
+ def _do_failure_meter(self, meter, message, now):
+ pass
+
+ def message_meter(self, meter, message):
+ pass
+
+ ########################################################
+ # internal functions
+ def _amount_read(self):
+ tot = self.finished_file_size
+ for m in self.in_progress_meters:
+ tot += m.last_amount_read
+ return tot
+
+
+class TextMultiFileMeter(MultiFileMeter):
+ def __init__(self, fo=sys.stderr):
+ self.fo = fo
+ MultiFileMeter.__init__(self)
+
+ # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+ def _do_update_meter(self, meter, now):
+ self._lock.acquire()
+ try:
+ format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
+ "time: %8.8s/%8.8s"
+ df = self.finished_files
+ tf = self.numfiles or 1
+ pf = 100 * float(df)/tf + 0.49
+ dd = self.re.last_amount_read
+ td = self.total_size
+ pd = 100 * (self.re.fraction_read() or 0) + 0.49
+ dt = self.re.elapsed_time()
+ rt = self.re.remaining_time()
+ if rt is None: tt = None
+ else: tt = dt + rt
+
+ fdd = format_number(dd) + 'B'
+ ftd = format_number(td) + 'B'
+ fdt = format_time(dt, 1)
+ ftt = format_time(tt, 1)
+
+ out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
+ self.fo.write('\r' + out)
+ self.fo.flush()
+ finally:
+ self._lock.release()
+
+ def _do_end_meter(self, meter, now):
+ self._lock.acquire()
+ try:
+ format = "%-30.30s %6.6s %8.8s %9.9s"
+ fn = meter.basename
+ size = meter.last_amount_read
+ fsize = format_number(size) + 'B'
+ et = meter.re.elapsed_time()
+ fet = format_time(et, 1)
+ frate = format_number(size / et) + 'B/s'
+
+ out = '%-79.79s' % (format % (fn, fsize, fet, frate))
+ self.fo.write('\r' + out + '\n')
+ finally:
+ self._lock.release()
+ self._do_update_meter(meter, now)
+
+ def _do_failure_meter(self, meter, message, now):
+ self._lock.acquire()
+ try:
+ format = "%-30.30s %6.6s %s"
+ fn = meter.basename
+ if type(message) in (type(''), type(u'')):
+ message = message.splitlines()
+ if not message: message = ['']
+ out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
+ self.fo.write('\r' + out + '\n')
+            for m in message[1:]: self.fo.write(' ' + m + '\n')
+        finally:
+            # release before updating; _do_update_meter() re-acquires the lock
+            self._lock.release()
+        self._do_update_meter(meter, now)
+
+ def message_meter(self, meter, message):
+ self._lock.acquire()
+ try:
+ pass
+ finally:
+ self._lock.release()
+
+ def _do_end(self, now):
+ self._do_update_meter(None, now)
+ self._lock.acquire()
+ try:
+ self.fo.write('\n')
+ self.fo.flush()
+ finally:
+ self._lock.release()
+
+######################################################################
+# support classes and functions
+
+class RateEstimator:
+ def __init__(self, timescale=5.0):
+ self.timescale = timescale
+
+ def start(self, total=None, now=None):
+ if now is None: now = time.time()
+ self.total = total
+ self.start_time = now
+ self.last_update_time = now
+ self.last_amount_read = 0
+ self.ave_rate = None
+
+ def update(self, amount_read, now=None):
+ if now is None: now = time.time()
+ if amount_read == 0:
+ # if we just started this file, all bets are off
+ self.last_update_time = now
+ self.last_amount_read = 0
+ self.ave_rate = None
+ return
+
+ #print 'times', now, self.last_update_time
+ time_diff = now - self.last_update_time
+ read_diff = amount_read - self.last_amount_read
+        # skip the rate update on the first call: for a reget, the first
+        # amount_read is just the pre-existing file size
+ if self.last_amount_read:
+ self.last_update_time = now
+ self.ave_rate = self._temporal_rolling_ave(\
+ time_diff, read_diff, self.ave_rate, self.timescale)
+ self.last_amount_read = amount_read
+ #print 'results', time_diff, read_diff, self.ave_rate
+
+ #####################################################################
+ # result methods
+ def average_rate(self):
+ "get the average transfer rate (in bytes/second)"
+ return self.ave_rate
+
+ def elapsed_time(self):
+ "the time between the start of the transfer and the most recent update"
+ return self.last_update_time - self.start_time
+
+ def remaining_time(self):
+ "estimated time remaining"
+ if not self.ave_rate or not self.total: return None
+ return (self.total - self.last_amount_read) / self.ave_rate
+
+ def fraction_read(self):
+ """the fraction of the data that has been read
+ (can be None for unknown transfer size)"""
+ if self.total is None: return None
+ elif self.total == 0: return 1.0
+ else: return float(self.last_amount_read)/self.total
+
+ #########################################################################
+ # support methods
+ def _temporal_rolling_ave(self, time_diff, read_diff, last_ave, timescale):
+ """a temporal rolling average performs smooth averaging even when
+ updates come at irregular intervals. This is performed by scaling
+ the "epsilon" according to the time since the last update.
+ Specifically, epsilon = time_diff / timescale
+
+ As a general rule, the average will take on a completely new value
+ after 'timescale' seconds."""
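+        # e.g. with timescale=5.0, an update arriving 1.0s after the previous
+        # one gives epsilon = 0.2, so new_ave = 0.2*recent_rate + 0.8*last_ave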
+ epsilon = time_diff / timescale
+ if epsilon > 1: epsilon = 1.0
+ return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
+
+ def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
+ """perform a "rolling average" iteration
+ a rolling average "folds" new data into an existing average with
+ some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
+ a value of 0.0 means only the old value (initial value) counts,
+ and a value of 1.0 means only the newest value is considered."""
+
+ try:
+ recent_rate = read_diff / time_diff
+ except ZeroDivisionError:
+ recent_rate = None
+ if last_ave is None: return recent_rate
+ elif recent_rate is None: return last_ave
+
+ # at this point, both last_ave and recent_rate are numbers
+ return epsilon * recent_rate + (1 - epsilon) * last_ave
+
+ def _round_remaining_time(self, rt, start_time=15.0):
+ """round the remaining time, depending on its size
+ If rt is between n*start_time and (n+1)*start_time round downward
+ to the nearest multiple of n (for any counting number n).
+ If rt < start_time, round down to the nearest 1.
+ For example (for start_time = 15.0):
+ 2.7 -> 2.0
+ 25.2 -> 25.0
+ 26.4 -> 26.0
+ 35.3 -> 34.0
+ 63.6 -> 60.0
+ """
+
+ if rt < 0: return 0.0
+ shift = int(math.log(rt/start_time)/math.log(2))
+ rt = int(rt)
+ if shift <= 0: return rt
+ return float(int(rt) >> shift << shift)
+
+
+def format_time(seconds, use_hours=0):
+ if seconds is None or seconds < 0:
+ if use_hours: return '--:--:--'
+ else: return '--:--'
+ else:
+ seconds = int(seconds)
+ minutes = seconds / 60
+ seconds = seconds % 60
+ if use_hours:
+ hours = minutes / 60
+ minutes = minutes % 60
+ return '%02i:%02i:%02i' % (hours, minutes, seconds)
+ else:
+ return '%02i:%02i' % (minutes, seconds)
+
+def format_number(number, SI=0, space=' '):
+ """Turn numbers into human-readable metric-like numbers"""
+ symbols = ['', # (none)
+ 'k', # kilo
+ 'M', # mega
+ 'G', # giga
+ 'T', # tera
+ 'P', # peta
+ 'E', # exa
+ 'Z', # zetta
+ 'Y'] # yotta
+
+ if SI: step = 1000.0
+ else: step = 1024.0
+
+ thresh = 999
+ depth = 0
+ max_depth = len(symbols) - 1
+
+ # we want numbers between 0 and thresh, but don't exceed the length
+ # of our list. In that event, the formatting will be screwed up,
+ # but it'll still show the right number.
+ while number > thresh and depth < max_depth:
+ depth = depth + 1
+ number = number / step
+
+ if type(number) == type(1) or type(number) == type(1L):
+ # it's an int or a long, which means it didn't get divided,
+ # which means it's already short enough
+ format = '%i%s%s'
+ elif number < 9.95:
+ # must use 9.95 for proper sizing. For example, 9.99 will be
+ # rounded to 10.0 with the .1f format string (which is too long)
+ format = '%.1f%s%s'
+ else:
+ format = '%.0f%s%s'
+
+ return(format % (float(number or 0), space, symbols[depth]))
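+
+# Worked examples (values computed from the defaults above): with binary steps
+# format_number(10000) returns '9.8 k', while format_number(10000, SI=1) uses
+# decimal steps and returns '10 k'.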
+
+def _tst(fn, cur, tot, beg, size, *args):
+ tm = TextMeter()
+ text = "(%d/%d): %s" % (cur, tot, fn)
+ tm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size, text=text)
+ num = beg
+ off = 0
+ for (inc, delay) in args:
+ off += 1
+ while num < ((size * off) / len(args)):
+ num += inc
+ tm.update(num)
+ time.sleep(delay)
+ tm.end(size)
+
+if __name__ == "__main__":
+ # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
+ # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
+ if len(sys.argv) >= 2 and sys.argv[1] == 'total':
+ text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
+ 1000000 + 10000 + 10000 + 10000 + 1000000)
+ _tst("sm-1.0.0-1.fc8.i386.rpm", 1, 10, 0, 1000,
+ (10, 0.2), (10, 0.1), (100, 0.25))
+ _tst("s-1.0.1-1.fc8.i386.rpm", 2, 10, 0, 10000,
+ (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ _tst("m-1.0.1-2.fc8.i386.rpm", 3, 10, 5000, 10000,
+ (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ _tst("large-file-name-Foo-11.8.7-4.5.6.1.fc8.x86_64.rpm", 4, 10, 0, 1000000,
+ (1000, 0.2), (1000, 0.1), (10000, 0.1))
+ _tst("large-file-name-Foo2-11.8.7-4.5.6.2.fc8.x86_64.rpm", 5, 10,
+ 500001, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
+ _tst("large-file-name-Foo3-11.8.7-4.5.6.3.fc8.x86_64.rpm", 6, 10,
+ 750002, 1000000, (1000, 0.2), (1000, 0.1), (10000, 0.1))
+ _tst("large-file-name-Foo4-10.8.7-4.5.6.1.fc8.x86_64.rpm", 7, 10, 0, 10000,
+ (100, 0.1))
+ _tst("large-file-name-Foo5-10.8.7-4.5.6.2.fc8.x86_64.rpm", 8, 10,
+ 5001, 10000, (100, 0.1))
+ _tst("large-file-name-Foo6-10.8.7-4.5.6.3.fc8.x86_64.rpm", 9, 10,
+ 7502, 10000, (1, 0.1))
+ _tst("large-file-name-Foox-9.8.7-4.5.6.1.fc8.x86_64.rpm", 10, 10,
+ 0, 1000000, (10, 0.5),
+ (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+ (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+ (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+ (100000, 0.1), (10000, 0.1), (10000, 0.1), (10000, 0.1),
+ (100000, 0.1), (1, 0.1))
--- /dev/null
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+
+import httplib
+import urllib2
+
+try:
+ from M2Crypto import SSL
+ from M2Crypto import httpslib
+ from M2Crypto import m2urllib2
+
+ have_m2crypto = True
+except ImportError:
+ have_m2crypto = False
+
+DEBUG = None
+
+if have_m2crypto:
+
+ class M2SSLFactory:
+
+ def __init__(self, ssl_ca_cert, ssl_context):
+ self.ssl_context = self._get_ssl_context(ssl_ca_cert, ssl_context)
+
+ def _get_ssl_context(self, ssl_ca_cert, ssl_context):
+ """
+ Create an ssl context using the CA cert file or ssl context.
+
+ The CA cert is used first if it was passed as an option. If not,
+ then the supplied ssl context is used. If no ssl context was supplied,
+ None is returned.
+ """
+ if ssl_ca_cert:
+ context = SSL.Context()
+ context.load_verify_locations(ssl_ca_cert)
+ context.set_verify(SSL.verify_peer, -1)
+ return context
+ else:
+ return ssl_context
+
+ def create_https_connection(self, host, response_class = None):
+ connection = httplib.HTTPSConnection(host, self.ssl_context)
+ if response_class:
+ connection.response_class = response_class
+ return connection
+
+ def create_opener(self, *handlers):
+ return m2urllib2.build_opener(self.ssl_context, *handlers)
+
+
+class SSLFactory:
+
+ def create_https_connection(self, host, response_class = None):
+ connection = httplib.HTTPSConnection(host)
+ if response_class:
+ connection.response_class = response_class
+ return connection
+
+ def create_opener(self, *handlers):
+ return urllib2.build_opener(*handlers)
+
+
+
+def get_factory(ssl_ca_cert = None, ssl_context = None):
+ """ Return an SSLFactory, based on if M2Crypto is available. """
+ if have_m2crypto:
+ return M2SSLFactory(ssl_ca_cert, ssl_context)
+ else:
+ # Log here if someone provides the args but we don't use them.
+ if ssl_ca_cert or ssl_context:
+ if DEBUG:
+ DEBUG.warning("SSL arguments supplied, but M2Crypto is not available. "
+ "Using Python SSL.")
+ return SSLFactory()
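+
+# Illustrative usage sketch (the CA bundle path and host name below are
+# placeholders, not defaults of this module):
+#
+#   import sslfactory
+#   factory = sslfactory.get_factory(ssl_ca_cert='/path/to/ca-bundle.crt')
+#   conn = factory.create_https_connection('www.example.org')
+#   opener = factory.create_opener()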
--- /dev/null
+#!/usr/bin/python
+
+import ctypes
+import fcntl
+import os
+import select
+import shutil
+import subprocess
+import sys
+import time
+
+from exception import *
+from logger import getLog
+
+_libc = ctypes.cdll.LoadLibrary(None)
+_errno = ctypes.c_int.in_dll(_libc, "errno")
+_libc.personality.argtypes = [ctypes.c_ulong]
+_libc.personality.restype = ctypes.c_int
+_libc.unshare.argtypes = [ctypes.c_int,]
+_libc.unshare.restype = ctypes.c_int
+CLONE_NEWNS = 0x00020000
+
+# taken from sys/personality.h
+PER_LINUX32=0x0008
+PER_LINUX=0x0000
+personality_defs = {
+ 'x86_64': PER_LINUX,
+ 'ppc64': PER_LINUX,
+ 'sparc64': PER_LINUX,
+ 'i386': PER_LINUX32,
+ 'i586': PER_LINUX32,
+ 'i686': PER_LINUX32,
+ 'ppc': PER_LINUX32,
+ 'sparc': PER_LINUX32,
+ 'sparcv9': PER_LINUX32,
+ 'ia64' : PER_LINUX,
+ 'alpha' : PER_LINUX,
+ 's390' : PER_LINUX32,
+ 's390x' : PER_LINUX,
+}
+
+def touch(filename):
+ getLog().debug("touching file: %s" % filename)
+ f = open(filename, "w")
+ f.close()
+
+def mkdir(*args):
+ for dirName in args:
+ getLog().debug("ensuring that dir exists: %s" % dirName)
+ if not os.path.exists(dirName):
+ try:
+ getLog().debug("creating dir: %s" % dirName)
+ os.makedirs(dirName)
+ except OSError, e:
+ getLog().exception("Could not create dir %s. Error: %s" % (dirName, e))
+ raise Error, "Could not create dir %s. Error: %s" % (dirName, e)
+
+def rm(path, *args, **kargs):
+    """Version of shutil.rmtree that ignores no-such-file-or-directory errors
+    and tries harder if it finds immutable files."""
+ tryAgain = 1
+ failedFilename = None
+ getLog().debug("remove tree: %s" % path)
+ while tryAgain:
+ tryAgain = 0
+ try:
+ shutil.rmtree(path, *args, **kargs)
+ except OSError, e:
+ if e.errno == 2: # no such file or directory
+ pass
+ elif e.errno==1 or e.errno==13:
+ tryAgain = 1
+ if failedFilename == e.filename:
+ raise
+ failedFilename = e.filename
+ os.system("chattr -R -i %s" % path)
+ else:
+ raise
+
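+# Read from the given file objects until EOF (or until the timeout expires),
+# forwarding complete lines to the logger and, if returnOutput is set,
+# returning the accumulated raw output. Used by do() below to capture a
+# child's stdout and stderr.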
+def logOutput(fds, logger, returnOutput=1, start=0, timeout=0):
+ output=""
+ done = 0
+
+ # set all fds to nonblocking
+ for fd in fds:
+ flags = fcntl.fcntl(fd, fcntl.F_GETFL)
+ if not fd.closed:
+ fcntl.fcntl(fd, fcntl.F_SETFL, flags| os.O_NONBLOCK)
+
+ tail = ""
+ while not done:
+ if (time.time() - start)>timeout and timeout!=0:
+ done = 1
+ break
+
+ i_rdy,o_rdy,e_rdy = select.select(fds,[],[],1)
+ for s in i_rdy:
+ # slurp as much input as is ready
+ input = s.read()
+ if input == "":
+ done = 1
+ break
+ if logger is not None:
+ lines = input.split("\n")
+ if tail:
+ lines[0] = tail + lines[0]
+ # we may not have all of the last line
+ tail = lines.pop()
+ for line in lines:
+ if line == '': continue
+ logger.debug(line)
+ for h in logger.handlers:
+ h.flush()
+ if returnOutput:
+ output += input
+ if tail and logger is not None:
+ logger.debug(tail)
+ return output
+
+# these functions are called in the child process, so no logging here
+def condChroot(chrootPath):
+ if chrootPath is not None:
+ os.chdir(chrootPath)
+ os.chroot(chrootPath)
+
+def condChdir(cwd):
+ if cwd is not None:
+ os.chdir(cwd)
+
+def condPersonality(per=None):
+ if per is None or per in ('noarch',):
+ return
+ if personality_defs.get(per, None) is None:
+ return
+ res = _libc.personality(personality_defs[per])
+ if res == -1:
+ raise OSError(_errno.value, os.strerror(_errno.value))
+
+def do(command, shell=False, chrootPath=None, cwd=None, timeout=0, raiseExc=True, returnOutput=0, personality=None, *args, **kargs):
+ logger = kargs.get("logger", getLog())
+ output = ""
+ start = time.time()
+ env = kargs.get("env", None)
+ preexec = ChildPreExec(personality, chrootPath, cwd)
+ try:
+ child = None
+ logger.debug("Executing command: %s" % command)
+ child = subprocess.Popen(
+ command,
+ shell=shell,
+ bufsize=0, close_fds=True,
+ stdin=open("/dev/null", "r"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ preexec_fn = preexec,
+ env=env
+ )
+
+        # use select() to poll for output so we don't block
+ output = logOutput([child.stdout, child.stderr],
+ logger, returnOutput, start, timeout)
+
+ except:
+        # kill children if they aren't done yet
+ if child is not None and child.returncode is None:
+ os.killpg(child.pid, 9)
+ try:
+ if child is not None:
+ os.waitpid(child.pid, 0)
+ except:
+ pass
+ raise
+
+ # wait until child is done, kill it if it passes timeout
+ niceExit=1
+ while child.poll() is None:
+ if (time.time() - start)>timeout and timeout!=0:
+ niceExit=0
+ os.killpg(child.pid, 15)
+ if (time.time() - start)>(timeout+1) and timeout!=0:
+ niceExit=0
+ os.killpg(child.pid, 9)
+
+ if not niceExit:
+ raise commandTimeoutExpired, ("Timeout(%s) expired for command:\n # %s\n%s" % (timeout, command, output))
+
+ logger.debug("Child returncode was: %s" % str(child.returncode))
+ if raiseExc and child.returncode:
+ if returnOutput:
+ raise Error, ("Command failed: \n # %s\n%s" % (command, output), child.returncode)
+ else:
+ raise Error, ("Command failed. See logs for output.\n # %s" % (command,), child.returncode)
+
+ return output
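+
+# A minimal usage sketch (command, chroot path and timeout are hypothetical;
+# chrootPath requires running as root):
+#
+#   out = do(["make", "install"], chrootPath="/var/tmp/buildroot",
+#            returnOutput=1, timeout=3600)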
+
+class ChildPreExec(object):
+ def __init__(self, personality, chrootPath, cwd):
+ self.personality = personality
+ self.chrootPath = chrootPath
+ self.cwd = cwd
+
+ def __call__(self, *args, **kargs):
+ os.setpgrp()
+ condPersonality(self.personality)
+ condChroot(self.chrootPath)
+ condChdir(self.cwd)
@echo "PKG_OBJECTS=\"$(strip $(OBJECTS))\""
@echo "PKG_PACKAGES=\"$(PKG_PACKAGES)\""
@echo "PKG_PACKAGES_FILES=\"$(PKG_PACKAGES_FILES)\""
+ @echo "PKG_PATCHES=\"$(PKG_PATCHES)\""
@echo "PKG_VER=\"$(PKG_VER)\""
@echo "PKG_REL=\"$(PKG_REL)\""
@echo "PKG_SUMMARY=\"$(strip $(PKG_SUMMARY))\""
--- /dev/null
+#!/bin/bash
+###############################################################################
+# #
+# IPFire.org - A linux based firewall #
+# Copyright (C) 2007, 2008, 2009 Michael Tremer & Christian Schmidt #
+# #
+# This program is free software: you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License as published by #
+# the Free Software Foundation, either version 3 of the License, or #
+# (at your option) any later version. #
+# #
+# This program is distributed in the hope that it will be useful, #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
+# GNU General Public License for more details. #
+# #
+# You should have received a copy of the GNU General Public License #
+# along with this program. If not, see <http://www.gnu.org/licenses/>. #
+# #
+###############################################################################
+
+FILES=
+ROOT=/
+
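+# Example invocation (the script name and paths are hypothetical): unpack the
+# given packages into an alternative root directory:
+#
+#   extract-packages.sh --root=/install-root /path/to/packages/*.ipk
+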
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --root=*)
+ ROOT=${1#--root=}
+ ;;
+ *.ipk)
+ file=$1
+			[ "${file:0:1}" != "/" ] && file="$(pwd)/$file"
+ if [ -e "$file" ]; then
+ FILES="$FILES $file"
+ else
+ echo "File does not exist: $file" >&2
+ #exit 1
+ fi
+ ;;
+ esac
+ shift
+done
+
+if [ "$ROOT" != "/" ]; then
+	[ -d "$ROOT" ] || mkdir -p "$ROOT"
+	cd "$ROOT"
+ if [ "$(pwd)" != "$ROOT" ]; then
+ echo "Could not change to root: $ROOT" >&2
+ exit 1
+ fi
+fi
+
+# Sort all packages
+FILES=$(for i in $FILES; do echo $i; done | sort -u)
+
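+# Each *.ipk is assumed to be a cpio archive (newc format) carrying an
+# xz-compressed tarball named data.img with the actual payload; the loop
+# below streams that member straight into tar.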
+for file in $FILES; do
+ echo "Extracting $file..."
+	cpio --extract --quiet -H newc --to-stdout data.img < "$file" | \
+		tar --extract --xz -C "${ROOT}"
+done