--- /dev/null
+from collections import Counter
+from google.protobuf import text_format
+from gflanguages import languages_public_pb2
+
+ATTRIBUTES = "base auxiliary marks punctuation index".split(" ")
+
+
+def main(args=None):
+ for path in args:
+ with open(path, encoding="utf-8") as fp:
+ language = text_format.Parse(
+ fp.read(), languages_public_pb2.LanguageProto()
+ )
+ changed = False
+ exemplar_values = {}
+ if not hasattr(language, "exemplar_chars"):
+ exit()
+ for attr in ATTRIBUTES:
+ if hasattr(language.exemplar_chars, attr):
+ values = getattr(language.exemplar_chars, attr).split(" ")
+ value_set = set()
+ clean_values = []
+ for value in values:
+ if value in value_set:
+ continue
+ else:
+ value_set.add(value)
+ clean_values.append(value)
+
+ if clean_values != values:
+ if {len(set(values))} != {len(set(clean_values))}:
+ print("before: " + " ".join(values))
+ print("after: " + " ".join(clean_values))
+ sys.exit("Failed fixing exemplar.")
+ setattr(language.exemplar_chars, attr, " ".join(clean_values))
+ changed = True
+ exemplar_values[attr] = {
+ "before": values,
+ "after": clean_values
+ }
+
+ if changed:
+ for exemplar, values in exemplar_values.items():
+ before = values["before"]
+ after = values["after"]
+ counter = Counter(before)
+ duplicates = [(g, c - 1) for g, c in counter.most_common() if c > 1]
+ print(
+ f"Changed {path} {exemplar} exemplar:\n"
+ f"- from {len(before)} ({len(set(before))} as set) "
+ f"to {len(after)} elements\n"
+ f"- removing {len(before) - len(after)} duplicate(s):\n"
+ f" {duplicates}\n"
+ )
+ with open(path, "w", encoding="utf-8") as fp:
+ fp.write(text_format.MessageToString(language, as_utf8=True))
+ fp.close()
+
+
+if __name__ == "__main__":
+ import sys
+
+ main(args=sys.argv[1:])