else:
return None
- def predict_custom_fields(self, content: str) -> list[int]:
+ def predict_custom_fields(self, content: str) -> dict:
+ """
+ Custom fields are a bit different from the other classifiers, as we
+ need to predict the values for the fields, not just the field itself.
+ """
+ # TODO: can this return the value?
from sklearn.utils.multiclass import type_of_target
if self.custom_fields_classifier:
)
-def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
+def match_custom_fields(
+    document: Document,
+    classifier: DocumentClassifier,
+    user=None,
+) -> dict:
+    """
+    Return the custom fields of ``document`` that match it, with their values.
+
+    Custom fields work differently from the other matchers: the matched
+    value is needed as well, so the result is a dict keyed by the field
+    object. For a regular-expression match the value is the matched text;
+    for every other kind of match the value is ``None``.
+
+    A field whose regular expression cannot be compiled is logged and
+    skipped. The remaining fields are still evaluated and a dict is always
+    returned, so callers can rely on ``.keys()`` / ``.items()``.
+    """
+    # TODO: automatic (classifier) matches do not carry a value yet
     predicted_custom_field_ids = (
         classifier.predict_custom_fields(document.content) if classifier else []
     )
     fields = [instance.field for instance in document.custom_fields.all()]
-    return list(
-        filter(
-            lambda o: matches(o, document)
-            or (
-                o.matching_algorithm == MatchingModel.MATCH_AUTO
-                and o.pk in predicted_custom_field_ids
-            ),
-            fields,
-        ),
-    )
+    matched_fields = {}
+    for field in fields:
+        if field.matching_algorithm == MatchingModel.MATCH_AUTO:
+            if field.pk in predicted_custom_field_ids:
+                matched_fields[field] = None
+        elif field.matching_algorithm == MatchingModel.MATCH_REGEX:
+            # NOTE(review): the algorithm is read from the field itself but the
+            # pattern from field.matching_model - confirm both attributes exist.
+            pattern = field.matching_model.match
+            try:
+                match = re.search(pattern, document.content)
+            except re.error:
+                logger.error(
+                    f"Error while processing regular expression {pattern}",
+                )
+                # Skip only this field: returning here would hand the caller
+                # a non-dict and discard the matches already collected.
+                continue
+            if match:
+                matched_fields[field] = match.group()
+                log_reason(
+                    field.matching_model,
+                    document,
+                    f"the string {match.group()} matches the regular expression "
+                    f"{pattern}",
+                )
+        elif matches(field, document):
+            # Every other algorithm was handled by matches() before this
+            # change; keep those fields matching, without an extracted value.
+            matched_fields[field] = None
+
+    return matched_fields
def matches(matching_model: MatchingModel, document: Document):
document: Document,
logging_group=None,
classifier: DocumentClassifier | None = None,
- replace=False,
- suggest=False,
base_url=None,
stdout=None,
style_func=None,
+ *,
+ replace=False,
+ suggest=False,
**kwargs,
):
if replace:
current_fields = set([instance.field for instance in document.custom_fields.all()])
- matched_fields = matching.match_custom_fields(document, classifier)
+ matched_fields_w_values: dict = matching.match_custom_fields(document, classifier)
+ matched_fields = matched_fields_w_values.keys()
relevant_fields = set(matched_fields) - current_fields
)
for field in relevant_fields:
+ args = {
+ "field": field,
+ "document": document,
+ }
+ if field.pk in matched_fields_w_values:
+ value_field_name = CustomFieldInstance.get_value_field_name(
+ data_type=field.data_type,
+ )
+ args[value_field_name] = matched_fields_w_values[field.pk]
CustomFieldInstance.objects.create(
- field=field,
- document=document,
+ **args,
)