Script for generating uninames.h from UnicodeDataNames.txt.

author Bruno Haible <bruno@clisp.org>

Wed, 6 Feb 2002 12:52:46 +0000 (12:52 +0000)

committer Bruno Haible <bruno@clisp.org>

Sun, 21 Jun 2009 23:16:57 +0000 (01:16 +0200)
author Bruno Haible <bruno@clisp.org>
Wed, 6 Feb 2002 12:52:46 +0000 (12:52 +0000)
committer Bruno Haible <bruno@clisp.org>
Sun, 21 Jun 2009 23:16:57 +0000 (01:16 +0200)
diff --git a/libuniname/gen-uninames b/libuniname/gen-uninames

new file mode 100755 (executable)

index 0000000..f32f004
--- /dev/null
+++ b/libuniname/gen-uninames
@@ -0,0 +1,291 @@
+#!/usr/local/bin/clisp -C
+
+;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
+;;; Bruno Haible 2000-12-28
+
+(defparameter add-comments nil)
+
+(defstruct unicode-char
+  (code nil :type integer)
+  (name nil :type string)
+  word-indices
+  word-indices-index
+)
+
+(defstruct word-list
+  (hashed nil :type hash-table)
+  (sorted nil :type list)
+  size                          ; number of characters total
+  length                        ; number of words
+)
+
+(defun main (inputfile outputfile)
+  (declare (type string inputfile outputfile))
+  #+UNICODE (setq *default-file-encoding* charset:utf-8)
+  (let ((all-chars '()))
+    ;; Read all characters and names from the input file.
+    (with-open-file (istream inputfile :direction :input)
+      (loop
+        (let ((line (read-line istream nil nil)))
+          (unless line (return))
+          (let* ((i1 (position #\; line))
+                 (i2 (position #\; line :start (1+ i1)))
+                 (code-string (subseq line 0 i1))
+                 (code (parse-integer code-string :radix 16))
+                 (name-string (subseq line (1+ i1) i2)))
+            ; Ignore characters whose name starts with "<".
+            (unless (eql (char name-string 0) #\<)
+              ; Also ignore Hangul syllables; they are treated specially.
+              (unless (<= #xAC00 code #xD7A3)
+                ; Also ignore CJK compatibility ideographs; they are treated
+                ; specially as well.
+                (unless (or (<= #xF900 code #xFA2D) (<= #x2F800 code #x2FA1D))
+                  ; Transform the code so that it fits in 16 bits. In
+                  ; Unicode 3.1 the following ranges are used.
+                  ;   0x00000..0x033FF  >>12=  0x00..0x03  -> 0x0..0x3
+                  ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x4
+                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x5
+                  ;   0x10300..0x104FF  >>12=  0x10        -> 0x6
+                  ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x7
+                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x8
+                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0x9
+                  (flet ((transform (x)
+                           (dpb
+                             (case (ash x -12)
+                               ((#x00 #x01 #x02 #x03) (ash x -12))
+                               (#x0A 4)
+                               (#x0F 5)
+                               (#x10 6)
+                               (#x1D 7)
+                               (#x2F 8)
+                               (#xE0 9)
+                               (t (error "Update the transform function for 0x~5,'0X" x))
+                             )
+                             (byte 8 12)
+                             x
+                        )) )
+                    (push (make-unicode-char :code (transform code)
+                                             :name name-string)
+                          all-chars
+            ) ) ) ) )
+    ) ) ) )
+    (setq all-chars (nreverse all-chars))
+    ;; Split into words.
+    (let ((words-by-length (make-array 0 :adjustable t)))
+      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+        (let ((i1 0))
+          (loop
+            (when (>= i1 (length name)) (return))
+            (let ((i2 (or (position #\Space name :start i1) (length name))))
+              (let* ((word (subseq name i1 i2))
+                     (len (length word)))
+                (when (>= len (length words-by-length))
+                  (adjust-array words-by-length (1+ len))
+                )
+                (unless (aref words-by-length len)
+                  (setf (aref words-by-length len)
+                        (make-word-list
+                          :hashed (make-hash-table :test #'equal)
+                          :sorted '()
+                ) )     )
+                (let ((word-list (aref words-by-length len)))
+                  (unless (gethash word (word-list-hashed word-list))
+                    (setf (gethash word (word-list-hashed word-list)) t)
+                    (push word (word-list-sorted word-list))
+                ) )
+              )
+              (setq i1 (1+ i2))
+      ) ) ) )
+      ;; Sort the word lists.
+      (dotimes (len (length words-by-length))
+        (unless (aref words-by-length len)
+          (setf (aref words-by-length len)
+                (make-word-list
+                  :hashed (make-hash-table :test #'equal)
+                  :sorted '()
+        ) )     )
+        (let ((word-list (aref words-by-length len)))
+          (setf (word-list-sorted word-list)
+                (sort (word-list-sorted word-list) #'string<)
+          )
+          (setf (word-list-size word-list)
+                (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
+          )
+          (setf (word-list-length word-list)
+                (length (word-list-sorted word-list))
+      ) ) )
+      ;; Output the tables.
+      (with-open-file (ostream outputfile :direction :output
+                       #+UNICODE :external-format #+UNICODE charset:ascii)
+        (format ostream "/*~%")
+        (format ostream " * ~A~%" (file-namestring outputfile))
+        (format ostream " *~%")
+        (format ostream " * Unicode character name table.~%")
+        (format ostream " * Generated automatically by the gen-uninames utility.~%")
+        (format ostream " */~%")
+        (format ostream "~%")
+        (format ostream "static const char unicode_name_words[~D] = {~%"
+                        (let ((sum 0))
+                          (dotimes (len (length words-by-length))
+                            (let ((word-list (aref words-by-length len)))
+                              (incf sum (word-list-size word-list))
+                          ) )
+                          sum
+        )               )
+        (dotimes (len (length words-by-length))
+          (let ((word-list (aref words-by-length len)))
+            (dolist (word (word-list-sorted word-list))
+              (format ostream " ~{ '~C',~}~%" (coerce word 'list))
+        ) ) )
+        (format ostream "};~%")
+        (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
+                        (let ((sum 0))
+                          (dotimes (len (length words-by-length))
+                            (let ((word-list (aref words-by-length len)))
+                              (incf sum (word-list-length word-list))
+                          ) )
+                          sum
+        )               )
+        #| ; Redundant data
+        (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
+                        (let ((sum 0))
+                          (dotimes (len (length words-by-length))
+                            (let ((word-list (aref words-by-length len)))
+                              (incf sum (word-list-length word-list))
+                          ) )
+                          sum
+        )               )
+        (dotimes (len (length words-by-length))
+          (let ((word-list (aref words-by-length len)))
+            (when (word-list-sorted word-list)
+              (format ostream " ")
+              (do ((l (word-list-sorted word-list) (cdr l))
+                   (offset 0 (+ offset (length (car l)))))
+                  ((endp l))
+                (format ostream "~<~% ~0,79:; ~D,~>" offset)
+              )
+              (format ostream "~%")
+        ) ) )
+        (format ostream "};~%")
+        |#
+        (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
+                        (1+ (length words-by-length))
+        )
+        (let ((extra-offset 0)
+              (ind-offset 0))
+          (dotimes (len (length words-by-length))
+            (let ((word-list (aref words-by-length len)))
+              (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
+              (incf extra-offset (word-list-size word-list))
+              (incf ind-offset (word-list-length word-list))
+          ) )
+          (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
+        )
+        (format ostream "};~%")
+        (let ((ind-offset 0))
+          (dotimes (len (length words-by-length))
+            (let ((word-list (aref words-by-length len)))
+              (dolist (word (word-list-sorted word-list))
+                (setf (gethash word (word-list-hashed word-list)) ind-offset)
+                (incf ind-offset)
+        ) ) ) )
+        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
+          (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
+                          (gethash word (word-list-hashed (aref words-by-length (length word))))
+        ) )
+        ;; Compute the word-indices for every unicode-char.
+        (dolist (uc all-chars)
+          (let ((name (unicode-char-name uc))
+                (indices '()))
+            (let ((i1 0))
+              (loop
+                (when (>= i1 (length name)) (return))
+                (let ((i2 (or (position #\Space name :start i1) (length name))))
+                  (let* ((word (subseq name i1 i2))
+                         (len (length word)))
+                    (push (gethash word (word-list-hashed (aref words-by-length len)))
+                          indices
+                    )
+                  )
+                  (setq i1 (1+ i2))
+            ) ) )
+            (setf (unicode-char-word-indices uc)
+                  (coerce (nreverse indices) 'vector)
+            )
+        ) )
+        ;; Sort the list of unicode-chars by word-indices.
+        (setq all-chars
+              (sort all-chars
+                    (lambda (vec1 vec2)
+                      (let ((len1 (length vec1))
+                            (len2 (length vec2)))
+                        (do ((i 0 (1+ i)))
+                            (nil)
+                          (if (< i len2)
+                            (if (< i len1)
+                              (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
+                                    ((> (aref vec1 i) (aref vec2 i)) (return nil))
+                              )
+                              (return t)
+                            )
+                            (return nil)
+                    ) ) ) )
+                    :key #'unicode-char-word-indices
+        )     )
+        ;; Output the word-indices.
+        (format ostream "static const uint16_t unicode_names[~D] = {~%"
+                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+        )
+        (let ((i 0))
+          (dolist (uc all-chars)
+            (format ostream " ~{ ~D,~}"
+                            (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
+                                     (coerce (unicode-char-word-indices uc) 'list)
+                            )
+            )
+            (when add-comments
+              (format ostream "~40T/* ~A */" (unicode-char-name uc))
+            )
+            (format ostream "~%")
+            (setf (unicode-char-word-indices-index uc) i)
+            (incf i (length (unicode-char-word-indices uc)))
+        ) )
+        (format ostream "};~%")
+        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
+                        (length all-chars)
+        )
+        (dolist (uc all-chars)
+          (format ostream "  { 0x~4,'0X, ~D },"
+                          (unicode-char-code uc)
+                          (unicode-char-word-indices-index uc)
+          )
+          (when add-comments
+            (format ostream "~21T/* ~A */" (unicode-char-name uc))
+          )
+          (format ostream "~%")
+        )
+        (format ostream "};~%")
+        (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
+                        (length all-chars)
+        )
+        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+          (format ostream "  { 0x~4,'0X, ~D },"
+                          (unicode-char-code uc)
+                          (unicode-char-word-indices-index uc)
+          )
+          (when add-comments
+            (format ostream "~21T/* ~A */" (unicode-char-name uc))
+          )
+          (format ostream "~%")
+        )
+        (format ostream "};~%")
+        (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+        )
+        (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+        )
+      )
+) ) )
+
+(main (first *args*) (second *args*))
author	Bruno Haible <bruno@clisp.org>
	Wed, 6 Feb 2002 12:52:46 +0000 (12:52 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Sun, 21 Jun 2009 23:16:57 +0000 (01:16 +0200)