git.ipfire.org Git - dbl.git/commitdiff
sources: Refactor the ABP parser
author: Michael Tremer <michael.tremer@ipfire.org>
Fri, 2 Jan 2026 14:12:42 +0000 (14:12 +0000)
committer: Michael Tremer <michael.tremer@ipfire.org>
Fri, 2 Jan 2026 14:12:42 +0000 (14:12 +0000)
Some sources don't implement the format very strictly. To not miss any
data, we will have to split off any options and then check whether we
want to add the domain to our lists.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
src/dnsbl/sources.py

index b58c311ef2b1b3b60b01320b6791e19384d9c2c8..dbf5a46a66544747438e182334a06aaa54107a3e 100644 (file)
@@ -422,14 +422,42 @@ class Source(sqlmodel.SQLModel, database.BackendMixin, table=True):
                """
                        Parse the domain from the AdBlockPlus format
                """
-               if line.startswith("||"):
-                       # Remove the leading ||
-                       line = line.removeprefix("||")
+               # Skip any comments
+               if line.startswith("!") or line.startswith("@@"):
+                       return
+
+               # Remove the leading ||
+               # Some files actually don't always use this
+               line = line.removeprefix("||")
+
+               # Split off any options
+               line, __, options = line.partition("$")
+
+               # Parse the options
+               if options:
+                       # Split options by comma and strip any whitespace
+                       options = [option.strip() for option in options.split(",")]
+
+                       # Remove some options that we know and will ignore
+                       for option in ("all", "important"):
+                               try:
+                                       options.remove(option)
+                               except ValueError:
+                                       pass
+
+               # Cut off everything after ^
+               line, __, rest = line.partition("^")
 
-                       # Cut off everything after ^
-                       domain, _, rest = line.partition("^")
+               # The line now should only contain the domain
+               domain = line
 
-                       return domain
+               # Ignore any domains that have unknown/unsupported options
+               if options:
+                       log.warning(_("Cannot parse domain %s with unknown options: %s") \
+                               % (domain, ",".join(options)))
+                       return
+
+               return domain
 
        def _process_hosts(self, line):
                """