[people/stevee/pakfire.git] / python / pakfire / downloader.py

#!/usr/bin/python
###############################################################################
#                                                                             #
# Pakfire - The IPFire package management system                              #
# Copyright (C) 2011 Pakfire development team                                 #
#                                                                             #
# This program is free software: you can redistribute it and/or modify        #
# it under the terms of the GNU General Public License as published by        #
# the Free Software Foundation, either version 3 of the License, or           #
# (at your option) any later version.                                         #
#                                                                             #
# This program is distributed in the hope that it will be useful,             #
# but WITHOUT ANY WARRANTY; without even the implied warranty of              #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
# GNU General Public License for more details.                                #
#                                                                             #
# You should have received a copy of the GNU General Public License           #
# along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
#                                                                             #
###############################################################################

import json
import os
import pycurl
import random

import logging
log = logging.getLogger("pakfire")

from config import _Config

import urlgrabber.grabber
from urlgrabber.grabber import URLGrabber, URLGrabError
from urlgrabber.mirror import MirrorGroup
from urlgrabber.progress import TextMeter

from pakfire.constants import *
from pakfire.i18n import _

class PakfireGrabber(URLGrabber):
	"""
		URLGrabber subclass that applies the pakfire download settings.

		The constructor forces a fixed set of urlgrabber options and adds
		the optional throttle and proxy settings that are read from the
		"downloader" section of the configuration.
	"""
	def __init__(self, pakfire, *args, **kwargs):
		"""
			pakfire is either a _Config instance or an object that
			carries one in its "config" attribute. All other arguments
			are forwarded to URLGrabber.
		"""
		# These options always win over whatever the caller passed in.
		kwargs["quote"] = 0
		kwargs["user_agent"] = "pakfire/%s" % PAKFIRE_VERSION

		# NOTE(review): SSL host and peer verification are switched off
		# for every transfer made through this class.
		kwargs["ssl_verify_host"] = False
		kwargs["ssl_verify_peer"] = False

		# Accept the configuration itself or something that holds it.
		config = pakfire if isinstance(pakfire, _Config) else pakfire.config
		self.config = config

		# Limit the bandwidth if the configuration asks for it.
		throttle = config.get("downloader", "bandwidth_throttle")
		if throttle:
			try:
				throttle = int(throttle)
			except ValueError:
				log.error("Configuration value for bandwidth_throttle is invalid.")
				# An unparsable value is passed on as 0.
				throttle = 0

			kwargs["throttle"] = throttle

		# Use the same proxy for HTTP and HTTPS when one is configured.
		proxy = config.get("downloader", "http_proxy")
		if proxy:
			kwargs["proxies"] = { "http" : proxy, "https" : proxy }

		URLGrabber.__init__(self, *args, **kwargs)

	def fork(self):
		"""
			Install a fresh Curl object after the process has forked.
		"""
		# XXX Ugly hack: this pokes into a private module global of
		# urlgrabber. The previous Curl object is replaced without being
		# closed. The original author notes that this should really be
		# fixed in urlgrabber and/or pycurl.
		urlgrabber.grabber._curl_cache = pycurl.Curl()

	def check_offline_mode(self):
		"""
			Raise OfflineModeError if the "offline" option in the
			"downloader" section of the configuration is set.
		"""
		if self.config.get("downloader", "offline"):
			raise OfflineModeError

	def urlread(self, filename, *args, **kwargs):
		"""
			Like URLGrabber.urlread, but refuses to work in offline mode
			and passes the filename on encoded as UTF-8.
		"""
		self.check_offline_mode()

		# The encoding step exists for the older urlgrabber versions
		# that are packaged in Debian and Ubuntu (per the original note).
		encoded = filename.encode("utf-8")

		return URLGrabber.urlread(self, encoded, *args, **kwargs)

	def urlopen(self, filename, *args, **kwargs):
		"""
			Like URLGrabber.urlopen, but refuses to work in offline mode
			and passes the filename on as an ordinary str object.
		"""
		self.check_offline_mode()

		# Unlike urlread, urlopen needs a plain string here.
		return URLGrabber.urlopen(self, str(filename), *args, **kwargs)


class PackageDownloader(PakfireGrabber):
	"""
		PakfireGrabber that reports its progress through a TextMeter.
	"""
	def __init__(self, pakfire, *args, **kwargs):
		# Always install a new text progress meter, replacing any
		# progress_obj the caller may have passed.
		kwargs["progress_obj"] = TextMeter()

		PakfireGrabber.__init__(self, pakfire, *args, **kwargs)


class MetadataDownloader(PakfireGrabber):
	"""
		PakfireGrabber that sends a "Pragma: no-cache" header with
		its requests.
	"""
	def __init__(self, pakfire, *args, **kwargs):
		# Replaces any http_headers value passed by the caller.
		kwargs["http_headers"] = (('Pragma', 'no-cache'),)

		PakfireGrabber.__init__(self, pakfire, *args, **kwargs)


class DatabaseDownloader(PackageDownloader):
	"""
		PackageDownloader that additionally sends a "Pragma: no-cache"
		header with its requests.
	"""
	def __init__(self, pakfire, *args, **kwargs):
		# Replaces any http_headers value passed by the caller.
		kwargs["http_headers"] = (('Pragma', 'no-cache'),)

		PackageDownloader.__init__(self, pakfire, *args, **kwargs)


class SourceDownloader(object):
	"""
		Downloads source files into SOURCE_CACHE_DIR.

		Files are fetched through a PakfireGrabber with a text progress
		meter. If a list of mirror URLs is given, the grabber is wrapped
		into a MirrorGroup built from these URLs.
	"""
	def __init__(self, pakfire, mirrors=None):
		"""
			pakfire is handed to PakfireGrabber and is asked for its
			"offline" attribute before anything is downloaded.
			mirrors is an optional iterable of mirror URLs.
		"""
		self.pakfire = pakfire

		self.grabber = PakfireGrabber(
			self.pakfire,
			progress_obj = TextMeter(),
		)

		if mirrors:
			self.grabber = MirrorGroup(self.grabber,
				[{ "mirror" : m.encode("utf-8") } for m in mirrors])

	def download(self, files):
		"""
			Make sure that all given files are in the source cache.

			files is an iterable of file names relative to
			SOURCE_CACHE_DIR. Files that already exist and are not empty
			are left alone, everything else is downloaded.

			Returns the list of full paths: first the files that have
			already been there, then the freshly downloaded ones.

			Raises OfflineModeError if something needs to be downloaded
			while pakfire is offline, and DownloadError if a download
			fails or delivers an empty file.
		"""
		existant_files = []
		download_files = []

		for name in files:
			filename = os.path.join(SOURCE_CACHE_DIR, name)
			log.debug("Checking existance of %s..." % filename)

			# An empty file counts as missing and is fetched again.
			if os.path.exists(filename) and os.path.getsize(filename):
				log.debug("...exists!")
				existant_files.append(filename)
			else:
				log.debug("...does not exist!")
				download_files.append(filename)

		if download_files:
			log.info(_("Downloading source files:"))

			if self.pakfire.offline:
				raise OfflineModeError(_("Cannot download source code in offline mode."))

			# Create source download directory.
			if not os.path.exists(SOURCE_CACHE_DIR):
				os.makedirs(SOURCE_CACHE_DIR)

			for filename in download_files:
				basename = os.path.basename(filename)

				try:
					self.grabber.urlgrab(basename, filename=filename)
				except URLGrabError as e:
					# Remove partly downloaded file. There might be
					# nothing to remove, which is fine.
					try:
						os.unlink(filename)
					except OSError:
						pass

					raise DownloadError("%s %s" % (basename, e))

				# Check if the downloaded file was empty.
				if os.path.getsize(filename) == 0:
					# Remove the file and raise an error.
					os.unlink(filename)

					raise DownloadError(_("Downloaded empty file: %s") % basename)

			log.info("")

		return existant_files + download_files


class Mirror(object):
	"""
		Plain record that describes one download mirror.
	"""
	def __init__(self, url, location=None, preferred=False):
		"""
			url is the full URL of the mirror, location an optional
			location value and preferred tells whether this mirror
			should be tried before the others.
		"""
		# Save URL of the mirror in full format
		self.url = url

		# Save the location (if given)
		self.location = location

		# Save preference. This used to be hard-coded to False, which
		# silently dropped the argument and left no mirror preferred.
		self.preferred = preferred


class MirrorList(object):
	"""
		Keeps the list of mirrors of one repository.

		The list is downloaded from the mirrorlist URL, stored in the
		cache of the repository and can be turned into a MirrorGroup
		for urlgrabber.
	"""
	def __init__(self, pakfire, repo, mirrorlist):
		"""
			pakfire is asked for its "offline" attribute and is passed
			to the downloader, repo is the repository these mirrors
			belong to and mirrorlist is the URL the list is fetched from
			(may be empty).
		"""
		self.pakfire = pakfire
		self.repo = repo

		self.__mirrors = []

		# Save URL to more mirrors.
		self.mirrorlist = mirrorlist

	@property
	def base_mirror(self):
		"""
			Return a non-preferred Mirror for the base URL of the
			repository or None if the repository has got no base URL.
		"""
		if not self.repo.baseurl:
			return

		return Mirror(self.repo.baseurl, preferred=False)

	@property
	def distro(self):
		"""
			Shortcut to distro from repository.
		"""
		return self.repo.distro

	@property
	def cache(self):
		"""
			Shortcut to cache from repository.
		"""
		return self.repo.cache

	def update(self, force=False):
		"""
			Refresh the cached mirrorlist if needed and load it.

			The list is downloaded if force is set, if there is no
			cached copy or if the cached copy is older than TIME_24H.
			Nothing happens without a mirrorlist URL or in offline mode.
			If the download fails, a warning is logged and the known
			mirrors are left as they are.
		"""
		# XXX should this be allowed?
		if not self.mirrorlist:
			return

		# If the system is not online, we cannot download anything.
		if self.pakfire.offline:
			return

		log.debug("Updating mirrorlist for repository '%s' (force=%s)" % (self.repo.name, force))
		cache_filename = os.path.join("repodata", self.distro.sname, self.distro.release,
			self.repo.name, self.distro.arch, "mirrors")

		if not self.cache.exists(cache_filename):
			# Force the update if no mirrorlist is available.
			force = True

		elif not force:
			age = self.cache.age(cache_filename)

			# If the age could be determined and is higher than 24h,
			# we force an update.
			if age and age > TIME_24H:
				force = True

		if force:
			g = MetadataDownloader(self.pakfire)

			try:
				mirrordata = g.urlread(self.mirrorlist, limit=MIRRORLIST_MAXSIZE)
			except URLGrabError as e:
				log.warning("Could not update the mirrorlist for repo '%s': %s" % (self.repo.name, e))
				return

			# XXX check for empty files or damaged output

			# Save new mirror data to cache. The with statement makes
			# sure that the file is closed even if writing fails.
			with self.cache.open(cache_filename, "w") as f:
				f.write(mirrordata)

		# Read mirrorlist from cache and parse it.
		self.forget_mirrors()
		with self.cache.open(cache_filename) as f:
			self.parse_mirrordata(f.read())

	def parse_mirrordata(self, data):
		"""
			Parse a JSON document and add every entry of its "mirrors"
			list. Each entry is passed to add_mirror as keyword
			arguments.
		"""
		data = json.loads(data)

		for mirror in data["mirrors"]:
			self.add_mirror(**mirror)

	def add_mirror(self, *args, **kwargs):
		"""
			Create a Mirror from the given arguments and append it to
			the list.
		"""
		mirror = Mirror(*args, **kwargs)

		self.__mirrors.append(mirror)

	def forget_mirrors(self):
		"""
			Drop all mirrors that are known so far.
		"""
		self.__mirrors = []

	@property
	def preferred(self):
		"""
			Return a generator for all mirrors that are preferred.
		"""
		for mirror in self.__mirrors:
			if mirror.preferred:
				yield mirror

	@property
	def non_preferred(self):
		"""
			Return a generator for all mirrors that are not preferred.
		"""
		for mirror in self.__mirrors:
			if not mirror.preferred:
				yield mirror

	@property
	def all(self):
		"""
			Return a generator for all mirrors.
		"""
		for mirror in self.__mirrors:
			yield mirror

	def group(self, grabber):
		"""
			Return a MirrorGroup object for the given grabber.

			Preferred mirrors come first in random order, followed by
			all remaining mirrors and finally the base mirror of the
			repository. No URL is listed twice.
		"""
		# Make sure the mirrorlist is up to date.
		self.update()

		# A list of mirrors that is passed to MirrorGroup.
		mirrors = []

		# Add all preferred mirrors at the first place and shuffle them
		# that we will start at a random place.
		for mirror in self.preferred:
			mirrors.append({ "mirror" : mirror.url.encode("utf-8") })
		random.shuffle(mirrors)

		# All other mirrors are added as well and will only be used if all
		# preferred mirrors did not work.
		for mirror in self.all:
			entry = { "mirror" : mirror.url.encode("utf-8") }
			if entry not in mirrors:
				mirrors.append(entry)

		# Always add the base mirror if any.
		base_mirror = self.base_mirror
		if base_mirror:
			entry = { "mirror" : base_mirror.url.encode("utf-8") }
			if entry not in mirrors:
				mirrors.append(entry)

		return MirrorGroup(grabber, mirrors)


class Downloader(object):
	"""
		Bundles a PakfireGrabber with the MirrorGroup that is built
		from the given mirrors.
	"""
	def __init__(self, mirrors, files):
		"""
			mirrors must provide group(grabber) and a "pakfire"
			attribute. files is accepted but not used by this
			constructor.
		"""
		# PakfireGrabber requires a pakfire (or configuration) object as
		# its first argument, calling it without one raised a TypeError.
		# NOTE(review): assumes mirrors is a MirrorList, which carries
		# the pakfire object it was created with -- TODO confirm.
		self.grabber = PakfireGrabber(mirrors.pakfire)

		self.mirrorgroup = mirrors.group(self.grabber)
Commit	Line	Data
1de8761d	1	#!/usr/bin/python
b792d887 MT	2	###############################################################################
	3	# #
	4	# Pakfire - The IPFire package management system #
	5	# Copyright (C) 2011 Pakfire development team #
	6	# #
	7	# This program is free software: you can redistribute it and/or modify #
	8	# it under the terms of the GNU General Public License as published by #
	9	# the Free Software Foundation, either version 3 of the License, or #
	10	# (at your option) any later version. #
	11	# #
	12	# This program is distributed in the hope that it will be useful, #
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of #
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
	15	# GNU General Public License for more details. #
	16	# #
	17	# You should have received a copy of the GNU General Public License #
	18	# along with this program. If not, see <http://www.gnu.org/licenses/>. #
	19	# #
	20	###############################################################################
1de8761d MT	21
1de8761d MT	22	import json
062699ee	23	import os
aa14071d	24	import pycurl
4f91860e	25	import random
1de8761d	26
8b6bc023 MT	27	import logging
	28	log = logging.getLogger("pakfire")
	29
a6bd96bc	30	from config import _Config
e57c5475	31
aa14071d	32	import urlgrabber.grabber
1de8761d	33	from urlgrabber.grabber import URLGrabber, URLGrabError
4f91860e	34	from urlgrabber.mirror import MirrorGroup
14ea3228	35	from urlgrabber.progress import TextMeter
1de8761d	36
a2d1644c	37	from pakfire.constants import *
062699ee	38	from pakfire.i18n import _
1de8761d MT	39
	40	class PakfireGrabber(URLGrabber):
	41	"""
	42	Class to make some modifications on the urlgrabber configuration.
	43	"""
80104a80	44	def __init__(self, pakfire, args, *kwargs):
14ea3228 MT	45	kwargs.update({
	46	"quote" : 0,
	47	"user_agent" : "pakfire/%s" % PAKFIRE_VERSION,
a6bd96bc MT	48
	49	"ssl_verify_host" : False,
	50	"ssl_verify_peer" : False,
14ea3228 MT	51	})
14ea3228 MT	52
a6bd96bc	53	if isinstance(pakfire, _Config):
e57c5475 MT	54	config = pakfire
	55	else:
	56	config = pakfire.config
98733451	57	self.config = config
6a509182	58
cfc16a71	59	# Set throttle setting.
a6bd96bc	60	bandwidth_throttle = config.get("downloader", "bandwidth_throttle")
80104a80 MT	61	if bandwidth_throttle:
	62	try:
	63	bandwidth_throttle = int(bandwidth_throttle)
	64	except ValueError:
8b6bc023	65	log.error("Configuration value for bandwidth_throttle is invalid.")
80104a80 MT	66	bandwidth_throttle = 0
	67
	68	kwargs.update({ "throttle" : bandwidth_throttle })
	69
cfc16a71	70	# Configure HTTP proxy.
a6bd96bc	71	http_proxy = config.get("downloader", "http_proxy")
cfc16a71	72	if http_proxy:
c611f46b	73	kwargs.update({ "proxies" : { "http" : http_proxy, "https" : http_proxy }})
cfc16a71	74
14ea3228 MT	75	URLGrabber.__init__(self, args, *kwargs)
14ea3228 MT	76
aa14071d MT	77	def fork(self):
	78	"""
	79	Reset Curl object after forking a process.
	80	"""
	81	# XXX this is a very ugly hack and fiddles around with the internals
	82	# or urlgrabber. We should not touch these, but apparently nobody
	83	# else uses multiple threads or processes to talk to their servers.
	84	# So we simply replace Curl with a new instance without closing
	85	# the old one. This should be fixed in urlgrabber and/or pycurl.
	86	urlgrabber.grabber._curl_cache = pycurl.Curl()
	87
98733451 MT	88	def check_offline_mode(self):
	89	offline = self.config.get("downloader", "offline")
	90	if not offline:
	91	return
	92
	93	raise OfflineModeError
	94
4efe0da7	95	def urlread(self, filename, args, *kwargs):
98733451 MT	96	self.check_offline_mode()
98733451 MT	97
4efe0da7 MT	98	# This is for older versions of urlgrabber which are packaged in Debian
	99	# and Ubuntu and cannot handle filenames as a normal Python string but need
	100	# a unicode string.
	101	return URLGrabber.urlread(self, filename.encode("utf-8"), args, *kwargs)
	102
0f8d6745	103	def urlopen(self, filename, args, *kwargs):
98733451 MT	104	self.check_offline_mode()
98733451 MT	105
0f8d6745 MT	106	# However, urlopen requires the filename to be an ordinary string object.
	107	filename = str(filename)
	108
	109	return URLGrabber.urlopen(self, filename, args, *kwargs)
	110
14ea3228 MT	111
14ea3228 MT	112	class PackageDownloader(PakfireGrabber):
80104a80	113	def __init__(self, pakfire, args, *kwargs):
14ea3228	114	kwargs.update({
ca38a577	115	"progress_obj" : TextMeter(),
14ea3228 MT	116	})
14ea3228 MT	117
80104a80	118	PakfireGrabber.__init__(self, pakfire, args, *kwargs)
14ea3228 MT	119
	120
	121	class MetadataDownloader(PakfireGrabber):
80104a80	122	def __init__(self, pakfire, args, *kwargs):
14ea3228 MT	123	kwargs.update({
	124	"http_headers" : (('Pragma', 'no-cache'),),
	125	})
	126
80104a80	127	PakfireGrabber.__init__(self, pakfire, args, *kwargs)
14ea3228 MT	128
	129
	130	class DatabaseDownloader(PackageDownloader):
80104a80	131	def __init__(self, pakfire, args, *kwargs):
14ea3228 MT	132	kwargs.update({
	133	"http_headers" : (('Pragma', 'no-cache'),),
	134	})
	135
80104a80	136	PackageDownloader.__init__(self, pakfire, args, *kwargs)
1de8761d	137
4f91860e	138
062699ee MT	139	class SourceDownloader(object):
	140	def __init__(self, pakfire, mirrors=None):
	141	self.pakfire = pakfire
	142
	143	self.grabber = PakfireGrabber(
	144	self.pakfire,
	145	progress_obj = TextMeter(),
	146	)
	147
	148	if mirrors:
	149	self.grabber = MirrorGroup(self.grabber,
4efe0da7	150	[{ "mirror" : m.encode("utf-8") } for m in mirrors])
062699ee MT	151
	152	def download(self, files):
	153	existant_files = []
	154	download_files = []
	155
	156	for file in files:
	157	filename = os.path.join(SOURCE_CACHE_DIR, file)
b76f5f47	158	log.debug("Checking existance of %s..." % filename)
062699ee	159
9ddb19b9	160	if os.path.exists(filename) and os.path.getsize(filename):
b76f5f47	161	log.debug("...exists!")
062699ee MT	162	existant_files.append(filename)
062699ee MT	163	else:
b76f5f47	164	log.debug("...does not exist!")
062699ee MT	165	download_files.append(filename)
	166
	167	if download_files:
8b6bc023	168	log.info(_("Downloading source files:"))
062699ee	169
98733451 MT	170	if self.pakfire.offline:
	171	raise OfflineModeError, _("Cannot download source code in offline mode.")
	172
062699ee MT	173	# Create source download directory.
	174	if not os.path.exists(SOURCE_CACHE_DIR):
	175	os.makedirs(SOURCE_CACHE_DIR)
	176
	177	for filename in download_files:
	178	try:
	179	self.grabber.urlgrab(os.path.basename(filename), filename=filename)
	180	except URLGrabError, e:
08de9306 MT	181	# Remove partly downloaded file.
	182	try:
	183	os.unlink(filename)
	184	except OSError:
	185	pass
	186
062699ee MT	187	raise DownloadError, "%s %s" % (os.path.basename(filename), e)
062699ee MT	188
9ddb19b9 MT	189	# Check if the downloaded file was empty.
	190	if os.path.getsize(filename) == 0:
	191	# Remove the file and raise an error.
	192	os.unlink(filename)
	193
	194	raise DownloadError, _("Downloaded empty file: %s") \
	195	% os.path.basename(filename)
	196
8b6bc023	197	log.info("")
062699ee MT	198
	199	return existant_files + download_files
	200
	201
1de8761d	202	class Mirror(object):
4f91860e	203	def __init__(self, url, location=None, preferred=False):
1de8761d	204	# Save URL of the mirror in full format
4f91860e	205	self.url = url
1de8761d MT	206
	207	# Save the location (if given)
	208	self.location = location
	209
	210	# Save preference
	211	self.preferred = False
	212
	213
	214	class MirrorList(object):
0f8d6745	215	def __init__(self, pakfire, repo, mirrorlist):
1de8761d MT	216	self.pakfire = pakfire
	217	self.repo = repo
	218
	219	self.__mirrors = []
	220
	221	# Save URL to more mirrors.
0f8d6745 MT	222	self.mirrorlist = mirrorlist
	223
	224	@property
	225	def base_mirror(self):
	226	if not self.repo.baseurl:
	227	return
	228
	229	return Mirror(self.repo.baseurl, preferred=False)
1de8761d	230
5a99898b MT	231	@property
	232	def distro(self):
	233	return self.repo.distro
	234
1de8761d MT	235	@property
	236	def cache(self):
	237	"""
	238	Shortcut to cache from repository.
	239	"""
	240	return self.repo.cache
	241
	242	def update(self, force=False):
	243	# XXX should this be allowed?
	244	if not self.mirrorlist:
	245	return
	246
c07a3ca7 MT	247	# If the system is not online, we cannot download anything.
	248	if self.pakfire.offline:
	249	return
	250
8b6bc023	251	log.debug("Updating mirrorlist for repository '%s' (force=%s)" % (self.repo.name, force))
5a99898b MT	252	cache_filename = os.path.join("repodata", self.distro.sname, self.distro.release,
5a99898b MT	253	self.repo.name, self.distro.arch, "mirrors")
1de8761d MT	254
	255	# Force the update if no mirrorlist is available.
	256	if not self.cache.exists(cache_filename):
	257	force = True
	258
	259	if not force and self.cache.exists(cache_filename):
	260	age = self.cache.age(cache_filename)
	261
	262	# If the age could be determined and is higher than 24h,
	263	# we force an update.
	264	if age and age > TIME_24H:
	265	force = True
	266
	267	if force:
80104a80	268	g = MetadataDownloader(self.pakfire)
1de8761d MT	269
	270	try:
	271	mirrordata = g.urlread(self.mirrorlist, limit=MIRRORLIST_MAXSIZE)
	272	except URLGrabError, e:
8b6bc023	273	log.warning("Could not update the mirrorlist for repo '%s': %s" % (self.repo.name, e))
1de8761d MT	274	return
	275
	276	# XXX check for empty files or damaged output
	277
	278	# Save new mirror data to cache.
	279	f = self.cache.open(cache_filename, "w")
	280	f.write(mirrordata)
	281	f.close()
	282
	283	# Read mirrorlist from cache and parse it.
90919c62	284	self.forget_mirrors()
1de8761d MT	285	with self.cache.open(cache_filename) as f:
	286	self.parse_mirrordata(f.read())
	287
	288	def parse_mirrordata(self, data):
	289	data = json.loads(data)
	290
	291	for mirror in data["mirrors"]:
	292	self.add_mirror(**mirror)
	293
	294	def add_mirror(self, args, *kwargs):
	295	mirror = Mirror(args, *kwargs)
	296
	297	self.__mirrors.append(mirror)
	298
90919c62 MT	299	def forget_mirrors(self):
	300	self.__mirrors = []
	301
1de8761d MT	302	@property
	303	def preferred(self):
	304	"""
	305	Return a generator for all mirrors that are preferred.
	306	"""
	307	for mirror in self.__mirrors:
	308	if mirror.preferred:
	309	yield mirror
	310
4f91860e MT	311	@property
	312	def non_preferred(self):
	313	"""
	314	Return a generator for all mirrors that are not preferred.
	315	"""
	316	for mirror in self.__mirrors:
	317	if not mirror.preferred:
	318	yield mirror
	319
1de8761d MT	320	@property
	321	def all(self):
	322	"""
	323	Return a generator for all mirrors.
	324	"""
	325	for mirror in self.__mirrors:
	326	yield mirror
	327
4f91860e MT	328	def group(self, grabber):
	329	"""
	330	Return a MirrorGroup object for the given grabber.
	331	"""
90919c62 MT	332	# Make sure the mirrorlist is up to date.
	333	self.update()
	334
4f91860e MT	335	# A list of mirrors that is passed to MirrorGroup.
	336	mirrors = []
	337
	338	# Add all preferred mirrors at the first place and shuffle them
	339	# that we will start at a random place.
	340	for mirror in self.preferred:
9a1da36a	341	mirrors.append({ "mirror" : mirror.url.encode("utf-8") })
4f91860e MT	342	random.shuffle(mirrors)
	343
	344	# All other mirrors are added as well and will only be used if all
	345	# preferred mirrors did not work.
	346	for mirror in self.all:
9a1da36a MT	347	mirror = { "mirror" : mirror.url.encode("utf-8") }
9a1da36a MT	348	if mirror in mirrors:
4f91860e MT	349	continue
4f91860e MT	350
9a1da36a MT	351	mirrors.append(mirror)
	352
	353	# Always add the base mirror if any.
	354	base_mirror = self.base_mirror
	355	if base_mirror:
	356	mirror = { "mirror" : base_mirror.url.encode("utf-8") }
	357	if not mirror in mirrors:
	358	mirrors.append(mirror)
4f91860e MT	359
	360	return MirrorGroup(grabber, mirrors)
	361
	362
	363
	364	class Downloader(object):
	365	def __init__(self, mirrors, files):
	366	self.grabber = PakfireGrabber()
	367
	368	self.mirrorgroup = mirrors.group(self.grabber)
	369
	370