Renamed urlopen.py to urllib.py.
author    Guido van Rossum <guido@python.org>
          Tue, 22 Mar 1994 12:05:32 +0000 (12:05 +0000)
committer Guido van Rossum <guido@python.org>
          Tue, 22 Mar 1994 12:05:32 +0000 (12:05 +0000)
Lib/urllib.py [new file with mode: 0644]

diff --git a/Lib/urllib.py b/Lib/urllib.py
new file mode 100644 (file)
index 0000000..7350de6
--- /dev/null
@@ -0,0 +1,454 @@
+# Open an arbitrary URL
+#
+# See the following document for a tentative description of URLs:
+#     Uniform Resource Locators              Tim Berners-Lee
+#     INTERNET DRAFT                                    CERN
+#     IETF URL Working Group                    14 July 1993
+#     draft-ietf-uri-url-01.txt
+#
+# The object returned by URLopener().open(file) will differ per
+# protocol.  All you know is that it has methods read(), readline(),
+# readlines(), fileno(), close() and info().  The read*(), fileno()
+# and close() methods work like those of open files. 
+# The info() method returns an rfc822.Message object which can be
+# used to query various info about the object, if available.
+# (rfc822.Message objects are queried with the getheader() method.)
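+#
+# A minimal usage sketch (illustrative only; the URL is one of those used
+# by the test program at the bottom of this file, and the header queried
+# may of course be absent):
+#
+#     f = URLopener().open('http://www.cwi.nl/index.html')
+#     print f.info().getheader('Content-Type')
+#     data = f.read()
+#     f.close()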
+
+import socket
+import regex
+
+
+# This really consists of two pieces:
+# (1) a class which handles opening of all sorts of URLs
+#     (plus assorted utilities etc.)
+# (2) a set of functions for parsing URLs
+# XXX Should these be separated out into different modules?
+
+
+# Shortcut for basic usage
+_urlopener = None
+def urlopen(url):
+       global _urlopener
+       if not _urlopener:
+               _urlopener = URLopener()
+       return _urlopener.open(url)
+def urlretrieve(url):
+       global _urlopener
+       if not _urlopener:
+               _urlopener = URLopener()
+       return _urlopener.retrieve(url)
+def urlcleanup():
+       if _urlopener:
+               _urlopener.cleanup()
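+
+# Example use of the shortcuts above (illustrative sketch; both URLs are
+# taken from the test program at the bottom of this file):
+#
+#     f = urlopen('file:/etc/passwd')       # file-like object
+#     print f.read()
+#     fn, h = urlretrieve('http://www.cwi.nl/index.html')
+#     # fn is a filename; h is None for a local object and an
+#     # rfc822.Message with the response headers for a remote one
+#     urlcleanup()                          # removes any temporary files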
+
+
+# Class to open URLs.
+# This is a class rather than just a subroutine because we may need
+# more than one set of global protocol-specific options.
+ftpcache = {}
+class URLopener:
+
+       # Constructor
+       def __init__(self):
+               self.addheaders = []
+               self.tempcache = {}
+               self.ftpcache = ftpcache
+               # Undocumented feature: you can use a different
+               # ftp cache by assigning to the .ftpcache member;
+               # in case you want logically independent URL openers
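+               #
+               # For example (illustrative sketch; the name is made up):
+               #     private_opener = URLopener()
+               #     private_opener.ftpcache = {}   # no longer shares the
+               #                                    # module-level ftpcache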
+
+       def __del__(self):
+               self.close()
+
+       def close(self):
+               self.cleanup()
+
+       def cleanup(self):
+               import os
+               for url in self.tempcache.keys():
+                       try:
+                               os.unlink(self.tempcache[url][0])
+                       except os.error:
+                               pass
+                       del self.tempcache[url]
+
+       # Add a header to be used by the HTTP interface only
+       # e.g. u.addheader('Accept', 'sound/basic')
+       def addheader(self, *args):
+               self.addheaders.append(args)
+
+       # External interface
+       # Use URLopener().open(file) instead of open(file, 'r')
+       def open(self, url):
+               type, url = splittype(unwrap(url))
+               if not type: type = 'file'
+               name = 'open_' + type
+               if '-' in name:
+                       import regsub
+                       name = regsub.gsub('-', '_', name)
+               if not hasattr(self, name):
+                       raise IOError, ('url error', 'unknown url type', type)
+               try:
+                       return getattr(self, name)(url)
+               except socket.error, msg:
+                       raise IOError, ('socket error', msg)
+
+       # External interface
+       # retrieve(url) returns (filename, None) for a local object
+       # or (tempfilename, headers) for a remote object
+       def retrieve(self, url):
+               if self.tempcache.has_key(url):
+                       return self.tempcache[url]
+               url1 = unwrap(url)
+               if self.tempcache.has_key(url1):
+                       self.tempcache[url] = self.tempcache[url1]
+                       return self.tempcache[url1]
+               type, url1 = splittype(url1)
+               if not type or type == 'file':
+                       try:
+                               fp = self.open_local_file(url1)
+                               del fp
+                               return splithost(url1)[1], None
+                       except IOError, msg:
+                               pass
+               fp = self.open(url)
+               headers = fp.info()
+               import tempfile
+               tfn = tempfile.mktemp()
+               self.tempcache[url] = result = tfn, headers
+               tfp = open(tfn, 'w')
+               bs = 1024*8
+               block = fp.read(bs)
+               while block:
+                       tfp.write(block)
+                       block = fp.read(bs)
+               del fp
+               del tfp
+               return result
+
+       # Each method named open_<type> knows how to open that type of URL
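+       # (a subclass can add a new type; see the sketch after this class)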
+
+       # Use HTTP protocol
+       def open_http(self, url):
+               import httplib
+               host, selector = splithost(url)
+               h = httplib.HTTP(host)
+               h.putrequest('GET', selector)
+               for args in self.addheaders: apply(h.putheader, args)
+               errcode, errmsg, headers = h.getreply()
+               if errcode == 200: return addinfo(h.getfile(), headers)
+               else: raise IOError, ('http error', errcode, errmsg, headers)
+
+       # Use Gopher protocol
+       def open_gopher(self, url):
+               import gopherlib
+               host, selector = splithost(url)
+               type, selector = splitgophertype(selector)
+               selector, query = splitquery(selector)
+               if query: fp = gopherlib.send_query(selector, query, host)
+               else: fp = gopherlib.send_selector(selector, host)
+               return addinfo(fp, noheaders())
+
+       # Use local file or FTP depending on form of URL
+       def open_file(self, url):
+               try:
+                       return self.open_local_file(url)
+               except IOError:
+                       return self.open_ftp(url)
+
+       # Use local file
+       def open_local_file(self, url):
+               host, file = splithost(url)
+               if not host: return addinfo(open(file, 'r'), noheaders())
+               host, port = splitport(host)
+               if not port and socket.gethostbyname(host) in (
+                         localhost(), thishost()):
+                       return addinfo(open(file, 'r'), noheaders())
+               raise IOError, ('local file error', 'not on local host')
+
+       # Use FTP protocol
+       def open_ftp(self, url):
+               host, file = splithost(url)
+               if not host: raise IOError, ('ftp error', 'no host given')
+               host, port = splitport(host)
+               host = socket.gethostbyname(host)
+               if not port:
+                       import ftplib
+                       port = ftplib.FTP_PORT
+               key = (host, port)
+               try:
+                       if not self.ftpcache.has_key(key):
+                               self.ftpcache[key] = ftpwrapper(host, port)
+                       return addinfo(self.ftpcache[key].retrfile(file),
+                                 noheaders())
+               except ftperrors(), msg:
+                       raise IOError, ('ftp error', msg)
+
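+# The open() method of URLopener dispatches on the URL type: a URL of
+# type 'xxx' is handled by the method named open_xxx (with '-' mapped to
+# '_').  A subclass can therefore support an extra scheme simply by
+# defining such a method.  (Illustrative sketch only; 'finger' and
+# open_finger are hypothetical.)
+#
+#     class MyURLopener(URLopener):
+#             def open_finger(self, url):
+#                     host, path = splithost(url)
+#                     # ... talk to the server and return an
+#                     # addinfo(fp, noheaders()) object ...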
+
+# Utility functions
+
+# Return the IP address of the magic hostname 'localhost'
+_localhost = None
+def localhost():
+       global _localhost
+       if not _localhost:
+               _localhost = socket.gethostbyname('localhost')
+       return _localhost
+
+# Return the IP address of the current host
+_thishost = None
+def thishost():
+       global _thishost
+       if not _thishost:
+               _thishost = socket.gethostbyname(socket.gethostname())
+       return _thishost
+
+# Return the set of errors raised by the FTP class
+_ftperrors = None
+def ftperrors():
+       global _ftperrors
+       if not _ftperrors:
+               import ftplib
+               _ftperrors = (ftplib.error_reply,
+                             ftplib.error_temp,
+                             ftplib.error_perm,
+                             ftplib.error_proto)
+       return _ftperrors
+
+# Return an empty rfc822.Message object
+_noheaders = None
+def noheaders():
+       global _noheaders
+       if not _noheaders:
+               import rfc822
+               _noheaders = rfc822.Message(open('/dev/null', 'r'))
+               _noheaders.fp.close()   # Recycle file descriptor
+       return _noheaders
+
+
+# Utility classes
+
+# Class used by open_ftp() for cache of open FTP connections
+class ftpwrapper:
+       def __init__(self, host, port):
+               self.host = host
+               self.port = port
+               self.init()
+       def init(self):
+               import ftplib
+               self.ftp = ftplib.FTP()
+               self.ftp.connect(self.host, self.port)
+               self.ftp.login()
+       def retrfile(self, file):
+               import ftplib
+               try:
+                       self.ftp.voidcmd('TYPE I')
+               except ftplib.all_errors:
+                       self.init()
+                       self.ftp.voidcmd('TYPE I')
+               conn = None
+               if file:
+                       try:
+                               cmd = 'RETR ' + file
+                               conn = self.ftp.transfercmd(cmd)
+                       except ftplib.error_perm, reason:
+                               if reason[:3] != '550':
+                                       raise IOError, ('ftp error', reason)
+               if not conn:
+                       # Try a directory listing
+                       if file: cmd = 'LIST ' + file
+                       else: cmd = 'LIST'
+                       conn = self.ftp.transfercmd(cmd)
+               return addclosehook(conn.makefile('r'), self.ftp.voidresp)
+
+# Base class for addinfo and addclosehook
+class addbase:
+       def __init__(self, fp):
+               self.fp = fp
+               self.read = self.fp.read
+               self.readline = self.fp.readline
+               self.readlines = self.fp.readlines
+               self.fileno = self.fp.fileno
+       def __repr__(self):
+               return '<%s at %s whose fp = %s>' % (
+                         self.__class__.__name__, `id(self)`, `self.fp`)
+       def __del__(self):
+               self.close()
+       def close(self):
+               self.read = None
+               self.readline = None
+               self.readlines = None
+               self.fileno = None
+               self.fp = None
+
+# Class to add a close hook to an open file
+class addclosehook(addbase):
+       def __init__(self, fp, closehook, *hookargs):
+               addbase.__init__(self, fp)
+               self.closehook = closehook
+               self.hookargs = hookargs
+       def close(self):
+               if self.closehook:
+                       apply(self.closehook, self.hookargs)
+                       self.closehook = None
+                       self.hookargs = None
+               addbase.close(self)
+
+# class to add an info() method to an open file
+class addinfo(addbase):
+       def __init__(self, fp, headers):
+               addbase.__init__(self, fp)
+               self.headers = headers
+       def info(self):
+               return self.headers
+
+
+# Utility to combine a URL with a base URL to form a new URL
+
+def basejoin(base, url):
+       type, path = splittype(url)
+       if type: return url
+       host, path = splithost(path)
+       basetype, basepath = splittype(base)
+       basehost, basepath = splithost(basepath)
+       basepath, basetag = splittag(basepath)
+       basepath, basequery = splitquery(basepath)
+       type = basetype or 'file'
+       if path[:1] != '/':
+               import string
+               i = string.rfind(basepath, '/')
+               if i < 0: basepath = '/'
+               else: basepath = basepath[:i+1]
+               path = basepath + path
+       if not host: host = basehost
+       if host: return type + '://' + host + path
+       else: return type + ':' + path
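+
+# For example (illustrative; 'other.html' is a made-up relative path, the
+# other components come from the test program below):
+#     basejoin('http://www.cwi.nl/index.html', 'other.html')
+#             --> 'http://www.cwi.nl/other.html'
+#     basejoin('http://www.cwi.nl/index.html', '/etc/passwd')
+#             --> 'http://www.cwi.nl/etc/passwd'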
+
+
+# Utilities to parse URLs:
+# unwrap('<URL:type//host/path>') --> 'type//host/path'
+# splittype('type:opaquestring') --> 'type', 'opaquestring'
+# splithost('//host[:port]/path') --> 'host[:port]', '/path'
+# splitport('host:port') --> 'host', 'port'
+# splitquery('/path?query') --> '/path', 'query'
+# splittag('/path#tag') --> '/path', 'tag'
+# splitgophertype('/Xselector') --> 'X', 'selector'
+# unquote('abc%20def') --> 'abc def'
+# quote('abc def') --> 'abc%20def'
+
+def unwrap(url):
+       import string
+       url = string.strip(url)
+       if url[:1] == '<' and url[-1:] == '>':
+               url = string.strip(url[1:-1])
+       if url[:4] == 'URL:': url = string.strip(url[4:])
+       return url
+
+_typeprog = regex.compile('^\([^/:]+\):\(.*\)$')
+def splittype(url):
+       if _typeprog.match(url) >= 0: return _typeprog.group(1, 2)
+       return None, url
+
+_hostprog = regex.compile('^//\([^/]+\)\(.*\)$')
+def splithost(url):
+       if _hostprog.match(url) >= 0: return _hostprog.group(1, 2)
+       return None, url
+
+_portprog = regex.compile('^\(.*\):\([0-9]+\)$')
+def splitport(host):
+       if _portprog.match(host) >= 0: return _portprog.group(1, 2)
+       return host, None
+
+_queryprog = regex.compile('^\(.*\)\?\([^?]*\)$')
+def splitquery(url):
+       if _queryprog.match(url) >= 0: return _queryprog.group(1, 2)
+       return url, None
+
+_tagprog = regex.compile('^\(.*\)#\([^#]*\)$')
+def splittag(url):
+       if _tagprog.match(url) >= 0: return _tagprog.group(1, 2)
+       return url, None
+
+def splitgophertype(selector):
+       if selector[:1] == '/' and selector[1:2]:
+               return selector[1], selector[2:]
+       return None, selector
+
+_quoteprog = regex.compile('%[0-9a-fA-F][0-9a-fA-F]')
+def unquote(s):
+       import string
+       i = 0
+       n = len(s)
+       res = ''
+       while 0 <= i < n:
+               j = _quoteprog.search(s, i)
+               if j < 0:
+                       res = res + s[i:]
+                       break
+               res = res + (s[i:j] + chr(eval('0x' + s[j+1:j+3])))
+               i = j+3
+       return res
+
+_acceptable = \
+         'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._@'
+def quote(s):
+       res = ''
+       for c in s:
+               if c in _acceptable: res = res + c
+               else: res = res + '%%%02x' % ord(c)
+       return res
+
+# Test and time quote() and unquote()
+def test1():
+       import time
+       s = ''
+       for i in range(256): s = s + chr(i)
+       s = s*4
+       t0 = time.time()
+       qs = quote(s)
+       uqs = unquote(qs)
+       t1 = time.time()
+       if uqs != s:
+               print 'Wrong!'
+       print `s`
+       print `qs`
+       print `uqs`
+       print round(t1 - t0, 3), 'sec'
+
+
+# Test program
+def test():
+       import sys
+       import regsub
+       args = sys.argv[1:]
+       if not args:
+               args = [
+                       '/etc/passwd',
+                       'file:/etc/passwd',
+                       'file://localhost/etc/passwd',
+                       'ftp://ftp.cwi.nl/etc/passwd',
+                       'gopher://gopher.cwi.nl/11/',
+                       'http://www.cwi.nl/index.html',
+                       ]
+       try:
+               for url in args:
+                       print '-'*10, url, '-'*10
+                       fn, h = urlretrieve(url)
+                       print fn, h
+                       if h:
+                               print '======'
+                               for k in h.keys(): print k + ':', h[k]
+                               print '======'
+                       fp = open(fn, 'r')
+                       data = fp.read()
+                       del fp
+                       print regsub.gsub('\r', '', data)
+                       fn, h = None, None
+               print '-'*40
+       finally:
+               urlcleanup()
+
+# Run test program when run as a script
+if __name__ == '__main__':
+       test1()
+       test()