From: Jason Ish Date: Tue, 5 Mar 2024 22:54:13 +0000 (-0600) Subject: datasets: use filename based on filename; not content X-Git-Tag: 1.3.1~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=935d361c669343d9fd76f45b159e682f285eb4fc;p=thirdparty%2Fsuricata-update.git datasets: use filename based on filename; not content By using a hash of the content, a new file was created every time the dataset was updated and never cleaned up. To address this, use a filename that doesn't change based on the content. Bug: #6763 --- diff --git a/CHANGELOG.md b/CHANGELOG.md index 71f7d7f..a231f00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ instead of 4.0.0. - Handle URLs of bare files that don't end in .rules: https://redmine.openinfosecfoundation.org/issues/3664 +- Don't base dataset filenames on the contents of the file, but + instead the filename path: + https://redmine.openinfosecfoundation.org/issues/6763 ## 1.3.0 - 2023-07-07 diff --git a/suricata/update/main.py b/suricata/update/main.py index 9d2e36d..d41944e 100644 --- a/suricata/update/main.py +++ b/suricata/update/main.py @@ -465,9 +465,9 @@ def handle_dataset_files(rule, dep_files): return dataset_contents = dep_files[source_filename] - content_hash = hashlib.md5(dataset_contents).hexdigest() - new_rule = re.sub(r"(dataset.*?load\s+){}".format(dataset_filename), r"\g<1>datasets/{}".format(content_hash), rule.format()) - dest_filename = os.path.join(config.get_output_dir(), "datasets", content_hash) + source_filename_hash = hashlib.md5(source_filename.encode()).hexdigest() + new_rule = re.sub(r"(dataset.*?load\s+){}".format(dataset_filename), r"\g<1>datasets/{}".format(source_filename_hash), rule.format()) + dest_filename = os.path.join(config.get_output_dir(), "datasets", source_filename_hash) dest_dir = os.path.dirname(dest_filename) logger.debug("Copying dataset file {} to {}".format(dataset_filename, dest_filename)) try: