]>
Commit | Line | Data |
---|---|---|
1 | #!/usr/bin/python3 | |
2 | ||
3 | import difflib | |
4 | import hashlib | |
5 | import logging | |
6 | import markdown | |
7 | import markdown.extensions | |
8 | import markdown.preprocessors | |
9 | import os.path | |
10 | import re | |
11 | import urllib.parse | |
12 | ||
13 | from . import misc | |
14 | from . import util | |
15 | from .decorators import * | |
16 | ||
class Wiki(misc.Object):
	"""
	Provides access to the wiki: pages, uploaded files, full-text search,
	access control lists and per-user watchlists.

	Pages are stored as immutable revisions in the database; "editing" a
	page inserts a new revision and "deleting" it inserts a blank one.
	"""

	def _get_pages(self, query, *args):
		# Yield one Page object per row returned by the query
		res = self.db.query(query, *args)

		for row in res:
			yield Page(self.backend, row.id, data=row)

	def _get_page(self, query, *args):
		# Return a single Page object for the query, or None if no row matched
		res = self.db.get(query, *args)

		if res:
			return Page(self.backend, res.id, data=res)

	def __iter__(self):
		# Iterate over the current revision of all pages that are not deleted
		return self._get_pages("""
			SELECT
				wiki.*
			FROM
				wiki_current current
			LEFT JOIN
				wiki ON current.id = wiki.id
			WHERE
				current.deleted IS FALSE
			ORDER BY page
			""",
		)

	def make_path(self, page, path):
		"""
		Resolves a (possibly relative) link target against the given page
		and returns a normalised absolute path
		"""
		# Nothing to do for absolute links
		if path.startswith("/"):
			pass

		# Relative links (one-level down)
		elif path.startswith("./"):
			path = os.path.join(page, path)

		# All other relative links
		else:
			p = os.path.dirname(page)
			path = os.path.join(p, path)

		# Normalise links
		return os.path.normpath(path)

	def _make_url(self, path):
		"""
		Composes the URL out of the path
		"""
		# Remove any leading slashes (if present)
		path = path.removeprefix("/")

		return os.path.join("/docs", path)

	def get_page_title(self, page, default=None):
		"""
		Returns the title of the given page, falling back to the last
		path component if the page does not exist
		"""
		doc = self.get_page(page)
		if doc:
			title = doc.title
		else:
			title = os.path.basename(page)

		return title

	def get_page(self, page, revision=None):
		"""
		Returns the page at the given path - either a specific revision
		(identified by its timestamp) or the most recent one.

		Returns None for "action" paths (any component starting with "_").
		"""
		page = Page.sanitise_page_name(page)

		# Split the path into parts
		parts = page.split("/")

		# Check if this is an action
		if any((part.startswith("_") for part in parts)):
			return

		if revision:
			return self._get_page("SELECT * FROM wiki WHERE page = %s \
				AND timestamp = %s", page, revision)
		else:
			return self._get_page("SELECT * FROM wiki WHERE page = %s \
				ORDER BY timestamp DESC LIMIT 1", page)

	def get_recent_changes(self, account, limit=None):
		"""
		Yields the most recently changed pages that are visible to the
		given account - at most "limit" pages if a limit is given
		"""
		pages = self._get_pages("SELECT * FROM wiki \
			ORDER BY timestamp DESC")

		for page in pages:
			# Skip any pages the account has no permission for
			if not page.check_acl(account):
				continue

			yield page

			# Stop after "limit" pages have been returned
			# (the decrement used to run unconditionally which raised
			# TypeError whenever no limit was given)
			if limit is not None:
				limit -= 1
				if not limit:
					break

	def create_page(self, page, author, content, changes=None, address=None):
		"""
		Creates a new revision of the given page, stores the files it
		links to and notifies all watchers (except the author)
		"""
		page = Page.sanitise_page_name(page)

		# Write page to the database
		page = self._get_page("""
			INSERT INTO
				wiki
			(
				page,
				author_uid,
				markdown,
				changes,
				address
			) VALUES (
				%s, %s, %s, %s, %s
			)
			RETURNING *
			""", page, author.uid, content or None, changes, address,
		)

		# Store any linked files
		page._store_linked_files()

		# Send email to all watchers
		page._send_watcher_emails(excludes=[author])

		return page

	def delete_page(self, page, author, **kwargs):
		"""
		Deletes a page by storing an empty revision on top of it
		"""
		# Do nothing if the page does not exist
		if not self.get_page(page):
			return

		# Just creates a blank last version of the page
		self.create_page(page, author=author, content=None, **kwargs)

	def make_breadcrumbs(self, path):
		"""
		Returns (url, title) tuples for all ancestors of the given path,
		ordered from the top down (the root is not included)
		"""
		ret = []

		while path:
			# Cut off everything after the last slash
			path, _, _ = path.rpartition("/")

			# Do not include the root
			if not path:
				break

			# Find the page
			page = self.get_page(path)

			# Append the URL and title to the output
			ret.append((
				page.url if page else self._make_url(path),
				page.title if page else os.path.basename(path),
			))

		# Return the breadcrumbs in order
		return reversed(ret)

	def search(self, query, account=None, limit=None):
		"""
		Performs a full-text search and returns up to "limit" matching
		pages that the given account is permitted to read
		"""
		res = self._get_pages("""
			SELECT
				wiki.*
			FROM
				wiki_search_index search_index
			LEFT JOIN
				wiki ON search_index.wiki_id = wiki.id
			WHERE
				search_index.document @@ websearch_to_tsquery('english', %s)
			ORDER BY
				ts_rank(search_index.document, websearch_to_tsquery('english', %s)) DESC
			""", query, query,
		)

		pages = []
		for page in res:
			# Skip any pages the user doesn't have permission for
			if not page.check_acl(account):
				continue

			# Return any other pages
			pages.append(page)

			# Break when we have found enough pages
			if limit and len(pages) >= limit:
				break

		return pages

	def refresh(self):
		"""
		Needs to be called after a page has been changed
		"""
		self.db.execute("REFRESH MATERIALIZED VIEW CONCURRENTLY wiki_search_index")

	def get_watchlist(self, account):
		"""
		Returns all pages on the given account's watchlist, sorted
		"""
		pages = self._get_pages("""
			WITH pages AS (
				SELECT
					*
				FROM
					wiki_current
				LEFT JOIN
					wiki ON wiki_current.id = wiki.id
			)

			SELECT
				*
			FROM
				wiki_watchlist watchlist
			JOIN
				pages ON watchlist.page = pages.page
			WHERE
				watchlist.uid = %s
			""", account.uid,
		)

		return sorted(pages)

	# ACL

	def check_acl(self, page, account):
		"""
		Returns True if the given account may read the given page.

		Only the longest matching ACL entry is considered; if no entry
		matches at all, access is granted.
		"""
		res = self.db.query("""
			SELECT
				*
			FROM
				wiki_acls
			WHERE
				%s ILIKE (path || '%%')
			ORDER BY
				LENGTH(path) DESC
			LIMIT 1
			""", page,
		)

		for row in res:
			# Access not permitted when user is not logged in
			if not account:
				return False

			# If user is in a matching group, we grant permission
			for group in row.groups:
				if account.is_member_of_group(group):
					return True

			# Otherwise access is not permitted
			return False

		# If no ACLs are found, we permit access
		return True

	# Files

	def _get_files(self, query, *args):
		# Yield one File object per row returned by the query
		res = self.db.query(query, *args)

		for row in res:
			yield File(self.backend, row.id, data=row)

	def _get_file(self, query, *args):
		# Return a single File object for the query, or None if no row matched
		res = self.db.get(query, *args)

		if res:
			return File(self.backend, res.id, data=res)

	def get_files(self, path):
		"""
		Returns all files (that are not deleted) in the given directory
		"""
		files = self._get_files("""
			SELECT
				*
			FROM
				wiki_files
			WHERE
				path = %s
			AND
				deleted_at IS NULL
			ORDER BY filename
			""", path,
		)

		return list(files)

	def get_file_by_path(self, path, revision=None):
		"""
		Returns the file at the given path - either the latest version,
		or the version that was current at the given revision timestamp
		"""
		path, filename = os.path.dirname(path), os.path.basename(path)

		if revision:
			# Fetch a specific revision
			return self._get_file("""
				SELECT
					*
				FROM
					wiki_files
				WHERE
					path = %s
				AND
					filename = %s
				AND
					created_at <= %s
				ORDER BY
					created_at DESC
				LIMIT 1
				""", path, filename, revision,
			)

		# Fetch latest version
		return self._get_file("""
			SELECT
				*
			FROM
				wiki_files
			WHERE
				path = %s
			AND
				filename = %s
			AND
				deleted_at IS NULL
			""", path, filename,
		)

	def get_file_by_path_and_filename(self, path, filename):
		"""
		Returns the latest (not deleted) version of the given file
		"""
		return self._get_file("""
			SELECT
				*
			FROM
				wiki_files
			WHERE
				path = %s
			AND
				filename = %s
			AND
				deleted_at IS NULL
			""", path, filename,
		)

	def upload(self, path, filename, data, mimetype, author, address):
		"""
		Stores an uploaded file, replacing any existing file of the same
		name. The payload is de-duplicated by its MD5 digest in wiki_blobs.
		"""
		# Replace any existing files
		file = self.get_file_by_path_and_filename(path, filename)
		if file:
			file.delete(author)

		# Upload the blob first
		blob = self.db.get("""
			INSERT INTO
				wiki_blobs(data)
			VALUES
				(%s)
			ON CONFLICT
				(digest(data, %s))
			DO UPDATE
				SET data = EXCLUDED.data
			RETURNING id
			""", data, "MD5",
		)

		# Create entry for file
		return self._get_file("""
			INSERT INTO
				wiki_files
			(
				path,
				filename,
				author_uid,
				address,
				mimetype,
				blob_id,
				size
			) VALUES (
				%s, %s, %s, %s, %s, %s, %s
			)
			RETURNING *
			""", path, filename, author.uid, address, mimetype, blob.id, len(data),
		)

	def render(self, path, text, **kwargs):
		# Renders the given markdown text in the context of the given path
		return WikiRenderer(self.backend, path, text, **kwargs)
384 | ||
385 | ||
class Page(misc.Object):
	"""
	A single revision of a wiki page
	"""

	def init(self, id, data=None):
		# Database row ID and the raw row data
		self.id = id
		self.data = data

	def __repr__(self):
		return "<%s %s %s>" % (self.__class__.__name__, self.page, self.timestamp)

	def __eq__(self, other):
		if isinstance(other, self.__class__):
			return self.id == other.id

		return NotImplemented

	def __lt__(self, other):
		# Order by page path first, then by timestamp
		if isinstance(other, self.__class__):
			if self.page == other.page:
				return self.timestamp < other.timestamp

			return self.page < other.page

		return NotImplemented

	def __hash__(self):
		return hash(self.page)

	@staticmethod
	def sanitise_page_name(page):
		"""
		Normalises a page path: exactly one leading slash, no trailing
		slash and no double slashes
		"""
		if not page:
			return "/"

		# Make sure that the page name does NOT end with a /
		if page.endswith("/"):
			page = page[:-1]

		# Make sure the page name starts with a /
		if not page.startswith("/"):
			page = "/%s" % page

		# Remove any double slashes
		page = page.replace("//", "/")

		return page

	@property
	def url(self):
		return self.backend.wiki._make_url(self.page)

	@property
	def full_url(self):
		return "https://www.ipfire.org%s" % self.url

	@property
	def page(self):
		# The absolute path of this page
		return self.data.page

	@property
	def title(self):
		# Fall back to the last path component if the page has no headline
		return self._title or os.path.basename(self.page[1:])

	@property
	def _title(self):
		"""
		Returns the text of the first H1 headline, if the page starts
		with one (None otherwise)
		"""
		if not self.markdown:
			return

		# Find first H1 headline in markdown
		markdown = self.markdown.splitlines()

		# The non-greedy group is required so that an optional closing
		# "#" sequence is actually stripped from the title (the former
		# greedy pattern always swallowed it into the title)
		m = re.match(r"^#\s*(.*?)(?:\s+#+)?\s*$", markdown[0])
		if m:
			return m.group(1)

	@lazy_property
	def author(self):
		if self.data.author_uid:
			return self.backend.accounts.get_by_uid(self.data.author_uid)

	@property
	def markdown(self):
		# The raw markdown source (an empty string for deleted pages)
		return self.data.markdown or ""

	@property
	def html(self):
		lines = []

		# Strip off the first line if it contains a heading (as it will be shown separately)
		for i, line in enumerate(self.markdown.splitlines()):
			if i == 0 and line.startswith("#"):
				continue

			lines.append(line)

		renderer = self.backend.wiki.render(self.page, "\n".join(lines), revision=self.timestamp)

		return renderer.html

	# Linked Files

	@property
	def files(self):
		# All files that are referenced by this revision's markup
		renderer = self.backend.wiki.render(self.page, self.markdown, revision=self.timestamp)

		return renderer.files

	def _store_linked_files(self):
		# Remember which files this revision links to
		self.db.executemany("INSERT INTO wiki_linked_files(page_id, path) \
			VALUES(%s, %s)", ((self.id, file) for file in self.files))

	@property
	def timestamp(self):
		return self.data.timestamp

	def was_deleted(self):
		# A page counts as deleted when its content is empty
		return not self.markdown

	@lazy_property
	def breadcrumbs(self):
		return self.backend.wiki.make_breadcrumbs(self.page)

	def is_latest_revision(self):
		return self.get_latest_revision() == self

	def get_latest_revision(self):
		revisions = self.get_revisions()

		# Return first object
		for rev in revisions:
			return rev

	def get_revisions(self):
		# All revisions of this page, newest first
		return self.backend.wiki._get_pages("SELECT * FROM wiki \
			WHERE page = %s ORDER BY timestamp DESC", self.page)

	@lazy_property
	def previous_revision(self):
		return self.backend.wiki._get_page("SELECT * FROM wiki \
			WHERE page = %s AND timestamp < %s ORDER BY timestamp DESC \
			LIMIT 1", self.page, self.timestamp)

	@property
	def changes(self):
		return self.data.changes

	# ACL

	def check_acl(self, account):
		return self.backend.wiki.check_acl(self.page, account)

	# Watchers

	@lazy_property
	def diff(self):
		"""
		A unified diff between the previous revision and this one
		(None if there is no previous revision)
		"""
		if self.previous_revision:
			diff = difflib.unified_diff(
				self.previous_revision.markdown.splitlines(),
				self.markdown.splitlines(),
			)

			return "\n".join(diff)

	@property
	def watchers(self):
		# Yields all accounts watching this page
		res = self.db.query("SELECT uid FROM wiki_watchlist \
			WHERE page = %s", self.page)

		for row in res:
			# Search for account by UID and skip if none was found
			account = self.backend.accounts.get_by_uid(row.uid)
			if not account:
				continue

			# Return the account
			yield account

	def is_watched_by(self, account):
		res = self.db.get("SELECT 1 FROM wiki_watchlist \
			WHERE page = %s AND uid = %s", self.page, account.uid)

		if res:
			return True

		return False

	def add_watcher(self, account):
		if self.is_watched_by(account):
			return

		self.db.execute("INSERT INTO wiki_watchlist(page, uid) \
			VALUES(%s, %s)", self.page, account.uid)

	def remove_watcher(self, account):
		self.db.execute("DELETE FROM wiki_watchlist \
			WHERE page = %s AND uid = %s", self.page, account.uid)

	def _send_watcher_emails(self, excludes=None):
		"""
		Sends a notification email to all watchers that are permitted to
		read this page.

		excludes is an optional list of accounts that will not be
		notified (None replaces the former mutable default argument).
		"""
		# Nothing to do if there was no previous revision
		if not self.previous_revision:
			return

		if excludes is None:
			excludes = []

		for watcher in self.watchers:
			# Skip everyone who is excluded
			if watcher in excludes:
				logging.debug("Excluding %s" % watcher)
				continue

			# Check permissions
			if not self.backend.wiki.check_acl(self.page, watcher):
				logging.debug("Watcher %s does not have permissions" % watcher)
				continue

			logging.debug("Sending watcher email to %s" % watcher)

			# Compose message
			watcher.send_message("wiki/messages/page-changed", page=self, priority=-10)

	def restore(self, author, address, comment=None):
		"""
		Creates a new revision with the content of this (older) revision
		"""
		changes = "Restore to revision from %s" % self.timestamp.isoformat()

		# Append comment
		if comment:
			changes = "%s: %s" % (changes, comment)

		return self.backend.wiki.create_page(self.page,
			author, self.markdown, changes=changes, address=address)
610 | ||
611 | ||
class File(misc.Object):
	"""
	A single revision of a file that has been uploaded to the wiki
	"""

	def init(self, id, data):
		# Database row ID and the raw row data
		self.id = id
		self.data = data

	def __eq__(self, other):
		if isinstance(other, self.__class__):
			return self.id == other.id

		return NotImplemented

	@property
	def url(self):
		# The path under which this file is served
		return "/docs%s" % os.path.join(self.path, self.filename)

	@property
	def path(self):
		# The directory this file lives in
		return self.data.path

	@property
	def filename(self):
		return self.data.filename

	@property
	def mimetype(self):
		return self.data.mimetype

	@property
	def size(self):
		# Size of the payload in bytes
		return self.data.size

	@lazy_property
	def author(self):
		if self.data.author_uid:
			return self.backend.accounts.get_by_uid(self.data.author_uid)

	@property
	def created_at(self):
		return self.data.created_at

	# Alias so that files and pages can be handled alike
	timestamp = created_at

	def delete(self, author=None):
		"""
		Marks this file as deleted (the blob is kept so that older page
		revisions can still reference it).

		Raises RuntimeError if the file is still linked by any page.
		"""
		if not self.can_be_deleted():
			raise RuntimeError("Cannot delete %s" % self)

		self.db.execute("UPDATE wiki_files SET deleted_at = NOW(), deleted_by = %s \
			WHERE id = %s", author.uid if author else None, self.id)

	def can_be_deleted(self):
		# Cannot be deleted if still in use
		if self.pages:
			return False

		# Can be deleted
		return True

	@property
	def deleted_at(self):
		return self.data.deleted_at

	def get_latest_revision(self):
		revisions = self.get_revisions()

		# Return first object
		for rev in revisions:
			return rev

	def get_revisions(self):
		# All revisions of this file, newest first
		revisions = self.backend.wiki._get_files("SELECT * FROM wiki_files \
			WHERE path = %s AND filename = %s ORDER BY created_at DESC", self.path, self.filename)

		return list(revisions)

	def is_pdf(self):
		return self.mimetype in ("application/pdf", "application/x-pdf")

	def is_image(self):
		return self.mimetype.startswith("image/")

	def is_vector_image(self):
		return self.mimetype in ("image/svg+xml",)

	def is_bitmap_image(self):
		return self.is_image() and not self.is_vector_image()

	@lazy_property
	def blob(self):
		# The raw payload (as bytes) from the blob store, or None
		res = self.db.get("SELECT data FROM wiki_blobs \
			WHERE id = %s", self.data.blob_id)

		if res:
			return bytes(res.data)

	async def get_thumbnail(self, size, format=None):
		"""
		Returns a thumbnail of this (bitmap) image with the given width,
		served from the cache when possible
		"""
		assert self.is_bitmap_image()

		# Let thumbnails live in the cache for up to 24h
		ttl = 24 * 3600

		# The key encodes everything that makes the thumbnail unique
		cache_key = ":".join((
			"wiki",
			"thumbnail",
			self.path,
			util.normalize(self.filename),
			self.created_at.isoformat(),
			format or "N/A",
			"%spx" % size,
		))

		# Try to fetch the data from the cache
		async with await self.backend.cache.pipeline() as p:
			# Fetch the key
			await p.get(cache_key)

			# Reset the TTL
			await p.expire(cache_key, ttl)

			# Execute the pipeline
			thumbnail, _ = await p.execute()

		# Return the cached value
		if thumbnail:
			return thumbnail

		# Generate the thumbnail
		thumbnail = util.generate_thumbnail(self.blob, size, format=format, quality=95)

		# Put it into the cache for 24h
		await self.backend.cache.set(cache_key, thumbnail, ttl)

		return thumbnail

	@property
	def pages(self):
		"""
		Returns a list of all pages this file is linked by
		"""
		pages = self.backend.wiki._get_pages("""
			SELECT
				wiki.*
			FROM
				wiki_linked_files
			JOIN
				wiki_current ON wiki_linked_files.page_id = wiki_current.id
			LEFT JOIN
				wiki ON wiki_linked_files.page_id = wiki.id
			WHERE
				wiki_linked_files.path = %s
			ORDER BY
				wiki.page
			""", os.path.join(self.path, self.filename),
		)

		return list(pages)
767 | ||
768 | ||
class WikiRenderer(misc.Object):
	"""
	Renders markdown to HTML for a page at a given path, resolving
	relative links and replacing images by figure/modal markup
	"""

	# URL schemas that are rendered as external links
	schemas = (
		"ftp://",
		"git://",
		"http://",
		"https://",
		"rsync://",
		"sftp://",
		"ssh://",
		"webcal://",
	)

	# Links
	_links = re.compile(r"<a href=\"(.*?)\">(.*?)</a>")

	# Images
	_images = re.compile(r"<img alt(?:=\"(.*?)\")? src=\"(.*?)\" (?:title=\"(.*?)\" )?/>")

	def init(self, path, text, revision=None):
		self.path = path
		self.text = text

		# Optionally, the revision of the rendered page
		self.revision = revision

		# Markdown Renderer
		self.renderer = Markdown(
			self.backend,
			extensions=[
				LinkedFilesExtractorExtension(),
				PrettyLinksExtension(),
				"codehilite",
				"fenced_code",
				"footnotes",
				"nl2br",
				"sane_lists",
				"tables",
				"toc",
			],
		)

		# Render!
		self.html = self._render()

	def _render_link(self, m):
		"""
		Post-processes a rendered link: external links and email
		addresses are marked as such, everything else is resolved as an
		internal wiki path
		"""
		url, text = m.groups()

		# Treat links starting with a double slash as absolute
		if url.startswith("//"):
			# Remove the double-slash
			url = url.removeprefix("/")

			# Return a link
			return """<a href="%s">%s</a>""" % (url, text or url)

		# External Links
		for schema in self.schemas:
			if url.startswith(schema):
				return """<a class="link-external" href="%s">%s</a>""" % \
					(url, text or url)

		# Emails
		if "@" in url:
			# Strip mailto:
			if url.startswith("mailto:"):
				url = url[7:]

			return """<a class="link-external" href="mailto:%s">%s</a>""" % \
				(url, text or url)

		# Everything else must be an internal link
		path = self.backend.wiki.make_path(self.path, url)

		return """<a href="/docs%s">%s</a>""" % \
			(path, text or self.backend.wiki.get_page_title(path))

	def _render_image(self, m):
		"""
		Replaces a rendered <img> by a <figure> with caption and a modal
		that shows a larger version of the image
		"""
		alt_text, url, caption = m.groups()

		# Compute a hash over the URL to use as a unique element ID
		h = hashlib.new("md5")
		h.update(url.encode())
		id = h.hexdigest()

		html = """
			<div class="columns is-centered">
				<div class="column is-8">
					<figure class="image modal-trigger" data-target="%(id)s">
						<img src="/docs%(url)s?s=960&%(args)s" alt="%(caption)s">

						<figcaption class="figure-caption">%(caption)s</figcaption>
					</figure>

					<div class="modal is-large" id="%(id)s">
						<div class="modal-background"></div>

						<div class="modal-content">
							<p class="image">
								<img src="/docs%(url)s?s=2048&%(args)s" alt="%(caption)s"
									loading="lazy">
							</p>

							<a class="button is-small" href="/docs%(url)s?action=detail">
								<span class="icon">
									<i class="fa-solid fa-circle-info"></i>
								</span>
							</a>
						</div>

						<button class="modal-close is-large" aria-label="close"></button>
					</div>
				</div>
			</div>
		"""

		# Try to split query string
		url, delimiter, qs = url.partition("?")

		# Parse query arguments
		args = urllib.parse.parse_qs(qs)

		# Skip any absolute and external URLs
		if url.startswith("https://") or url.startswith("http://"):
			return html % {
				"caption" : caption or "",
				"id"      : id,
				"url"     : url,

				# Encode the arguments properly; the raw parse_qs dict
				# used to be interpolated here which produced a broken
				# query string (the dict's repr)
				"args"    : urllib.parse.urlencode(args, doseq=True),
			}

		# Build absolute path
		url = self.backend.wiki.make_path(self.path, url)

		# Find image
		file = self.backend.wiki.get_file_by_path(url, revision=self.revision)
		if not file or not file.is_image():
			return "<!-- Could not find image %s in %s -->" % (url, self.path)

		# Remove any requested size
		if "s" in args:
			del args["s"]

		# Link the image that has been the current version at the time of the page edit
		args["revision"] = file.timestamp

		return html % {
			"caption" : caption or "",
			"id"      : id,
			"url"     : url,

			# doseq flattens the list values that parse_qs returns
			# (without it they were encoded as their Python repr)
			"args"    : urllib.parse.urlencode(args, doseq=True),
		}

	def _render(self):
		"""
		Renders the markdown and post-processes all links and images
		"""
		logging.debug("Rendering %s" % self.path)

		# Render...
		text = self.renderer.convert(self.text)

		# Postprocess links
		text = self._links.sub(self._render_link, text)

		# Postprocess images to <figure>
		text = self._images.sub(self._render_image, text)

		return text

	@lazy_property
	def files(self):
		"""
		A list of all linked files that have been part of the rendered markup
		"""
		files = []

		for url in self.renderer.files:
			# Skip external images
			if url.startswith("https://") or url.startswith("http://"):
				continue

			# Make the URL absolute
			url = self.backend.wiki.make_path(self.path, url)

			# Check if this is a file (it could also just be a page)
			file = self.backend.wiki.get_file_by_path(url)
			if file:
				files.append(url)

		return files
957 | ||
958 | ||
class Markdown(markdown.Markdown):
	"""
	A markdown.Markdown subclass that carries a reference to the backend
	so that extensions and preprocessors can access it (e.g. to look up
	user accounts)
	"""
	def __init__(self, backend, *args, **kwargs):
		# Store the backend
		self.backend = backend

		# Call inherited setup routine
		super().__init__(*args, **kwargs)
966 | ||
967 | ||
class PrettyLinksExtension(markdown.extensions.Extension):
	"""
	Registers all preprocessors that turn plain-text references
	(Bugzilla bug IDs, CVE identifiers and @user mentions) into
	Markdown links
	"""
	def extendMarkdown(self, md):
		# Name and implementation of every preprocessor we provide
		preprocessors = (
			("bugzilla", BugzillaLinksPreprocessor),
			("cve", CVELinksPreprocessor),
			("user-mention", UserMentionPreprocessor),
		)

		# Register them all with the same priority
		for name, cls in preprocessors:
			md.preprocessors.register(cls(md), name, 10)
978 | ||
979 | ||
class BugzillaLinksPreprocessor(markdown.preprocessors.Preprocessor):
	"""
	Turns #NNNNN bug references (five or more digits) into links to
	Bugzilla
	"""
	regex = re.compile(r"(?:#(\d{5,}))", re.I)

	def run(self, lines):
		# Replace each reference with a Markdown link, line by line
		return (
			self.regex.sub(r"[#\1](https://bugzilla.ipfire.org/show_bug.cgi?id=\1)", line)
			for line in lines
		)
986 | ||
987 | ||
class CVELinksPreprocessor(markdown.preprocessors.Preprocessor):
	"""
	Turns CVE identifiers (e.g. "CVE-2024-1234" or "CVE 2024-1234")
	into links to the MITRE CVE database
	"""
	regex = re.compile(r"(?:CVE)[\s\-](\d{4}\-\d+)")

	def run(self, lines):
		# Replace each identifier with a Markdown link, line by line
		return (
			self.regex.sub(r"[CVE-\1](https://cve.mitre.org/cgi-bin/cvename.cgi?name=\1)", line)
			for line in lines
		)
994 | ||
995 | ||
class UserMentionPreprocessor(markdown.preprocessors.Preprocessor):
	"""
	Turns @handle mentions into links to the mentioned user's profile
	"""
	regex = re.compile(r"\B@(\w+)")

	def run(self, lines):
		# Process the text line by line
		for line in lines:
			yield self.regex.sub(self._replace, line)

	def _replace(self, m):
		"""
		Replaces a single matched mention, or puts the original text
		back if no such user exists
		"""
		# Fetch the user's handle
		uid = m.group(1)

		# Look up the user by their UID
		user = self.md.backend.accounts.get_by_uid(uid)

		# If the user was not found, we put back the matched text
		if not user:
			return m.group(0)

		# Link the user
		return "[%s](/users/%s)" % (user, user.uid)
1016 | ||
1017 | ||
class LinkedFilesExtractor(markdown.treeprocessors.Treeprocessor):
	"""
	Finds all Linked Files

	Collects the targets of all images and links of the rendered
	document on the Markdown instance (md.files).
	"""
	def __init__(self, *args, **kwargs):
		super().__init__(*args, **kwargs)

		# Collected URLs live on the Markdown instance
		self.md.files = []

	def run(self, root):
		# Remember the source of every embedded image
		self.md.files.extend(
			node.get("src") for node in root.findall(".//img")
		)

		# Remember the target of every link
		self.md.files.extend(
			node.get("href") for node in root.findall(".//a")
		)
1039 | ||
1040 | ||
class LinkedFilesExtractorExtension(markdown.extensions.Extension):
	"""
	Registers the LinkedFilesExtractor treeprocessor
	"""
	def extendMarkdown(self, md):
		extractor = LinkedFilesExtractor(md)

		md.treeprocessors.register(extractor, "linked-files-extractor", 10)