From: Simon Cozens
Date: Fri, 8 Nov 2024 15:52:12 +0000 (+0000)
Subject: Rust crate (#158)
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a8fa72c1c58668a51aeba347110f96604aa85d89;p=thirdparty%2Fgoogle%2Ffonts.git

Rust crate (#158)

* Initial commit
* Reduce chunk size to 100, to avoid pathological compile times
* rustfmt
* Loosen deps
* Tests, kind of
* Rename crate
* Use serde_json to store the data
---

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000000..90cf6dddeb
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "google-fonts-languages"
+version = "0.6.3"
+edition = "2021"
+
+[dependencies]
+bytes = "1.7.1"
+prost = "0.13"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+
+[build-dependencies]
+prost-build = "0.13"
+protobuf-support = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
+protobuf = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
+protobuf-parse = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
+glob = "0.3" # pinned: crates.io rejects wildcard ("*") version requirements
+prettyplease = "0.2"
+quote = "1.0"
+proc-macro2 = "1.0"
+syn = "2.0"
+itertools = "0.13"
+serde_json = "1.0"
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000000..c5e4c45c4b
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,173 @@
+use proc_macro2::TokenStream;
+use protobuf::reflect::{FieldDescriptor, ReflectValueRef};
+use quote::quote;
+use serde_json::Map;
+use std::io::{BufWriter, Write};
+use std::{env, fs::File, path::Path};
+
+fn main() {
+    // First we load up the descriptor using the protobuf crate
+    // so that we can do reflection on it.
+    let descriptors = protobuf_parse::Parser::new()
+        .pure()
+        .include(".")
+        .input("Lib/gflanguages/languages_public.proto")
+        .file_descriptor_set()
+        .expect("Could not parse languages_public.proto")
+    let protofile = descriptors.file.first().expect("No file in descriptor");
+    let descriptor = protobuf::reflect::FileDescriptor::new_dynamic(protofile.clone(), &[])
+        .expect("Could not create descriptor");
+
+    // Now we use the prost crate to compile them, so that we can
+    // generate Rust structs.
+    let mut config = prost_build::Config::new();
+
+    // The reflection can tell us what messages we have, so we can configure
+    // them to be deserializable with serde.
+    for message in descriptor.messages() {
+        config.type_attribute(
+            message.full_name(),
+            "#[derive(serde::Serialize, serde::Deserialize)]",
+        );
+    }
+    // Let's make our structs; this produces google.languages_public.rs
+    config
+        .compile_protos(
+            &["Lib/gflanguages/languages_public.proto"],
+            &["Lib/gflanguages/"],
+        )
+        .expect("Could not compile languages_public.proto");
+
+    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("data.rs");
+    let mut file = BufWriter::new(File::create(path).unwrap());
+    let mut output = quote! {
+        use std::collections::BTreeMap;
+        use std::sync::LazyLock;
+    };
+
+    output.extend(serialize_a_structure(
+        ".google.languages_public.RegionProto",
+        "Lib/gflanguages/data/regions/*.textproto",
+        "REGIONS",
+        &descriptor,
+    ));
+
+    output.extend(serialize_a_structure(
+        ".google.languages_public.ScriptProto",
+        "Lib/gflanguages/data/scripts/*.textproto",
+        "SCRIPTS",
+        &descriptor,
+    ));
+
+    output.extend(serialize_a_structure(
+        ".google.languages_public.LanguageProto",
+        "Lib/gflanguages/data/languages/*.textproto",
+        "LANGUAGES",
+        &descriptor,
+    ));
+
+    // Pretty-print the generated tokens so data.rs is readable in OUT_DIR.
+    let abstract_file: syn::File = syn::parse2(output).expect("Could not parse output");
+    let formatted = prettyplease::unparse(&abstract_file);
+    file.write_all(formatted.as_bytes())
+        .expect("Could not write to file");
+}
+
+/// Parse every .textproto matching `pathglob` as a `proto_name` message and
+/// emit a `pub static #output_variable: LazyLock<BTreeMap<String, Proto>>`
+/// that lazily deserializes the collected data from an embedded JSON blob.
+fn serialize_a_structure(
+    proto_name: &str,
+    pathglob: &str,
+    output_variable: &str,
+    descriptor: &protobuf::reflect::FileDescriptor,
+) -> TokenStream {
+    let proto = descriptor
+        .message_by_full_name(proto_name)
+        .unwrap_or_else(|| panic!("No {} message", proto_name));
+    let files: Vec<std::path::PathBuf> = glob::glob(pathglob)
+        .expect("Failed to read glob pattern")
+        .flatten()
+        .collect();
+    let name: TokenStream = proto.name().parse().unwrap();
+    let variable: TokenStream = output_variable.parse().unwrap();
+    let mut map = Map::new();
+    for file in files.into_iter() {
+        serialize_file(file, &proto, &mut map);
+    }
+    let json_var: TokenStream = format!("__{}", output_variable).parse().unwrap();
+    let docmsg = format!("A map of all the {} objects", name);
+    let json_dump = serde_json::to_string(&map).expect("Could not serialize");
+    quote! {
+        static #json_var: &str = #json_dump;
+
+        #[doc = #docmsg]
+        pub static #variable: LazyLock<BTreeMap<String, #name>> = LazyLock::new(|| {
+            serde_json::from_str(#json_var).expect("Could not deserialize")
+        });
+    }
+}
+
+/// Parse one .textproto file into `descriptor`'s message type and insert its
+/// JSON serialization into `value`, keyed by the file stem (e.g. "en_Latn").
+fn serialize_file(
+    path: std::path::PathBuf,
+    descriptor: &protobuf::reflect::MessageDescriptor,
+    value: &mut Map<String, serde_json::Value>,
+) {
+    let mut message = descriptor.new_instance();
+    let message_mut = message.as_mut();
+    let input = std::fs::read_to_string(&path).expect("Could not read file");
+    protobuf::text_format::merge_from_str(message_mut, &input)
+        .unwrap_or_else(|e| panic!("Could not parse file {:?}: {:?}", path, e));
+    let id = path.file_stem().unwrap().to_str().unwrap();
+    value.insert(id.to_string(), serialize_message(message_mut));
+}
+
+/// Reflectively convert a protobuf message into a JSON object.
+fn serialize_message(message: &dyn protobuf::MessageDyn) -> serde_json::Value {
+    let descriptor = message.descriptor_dyn();
+    let mut output = Map::new();
+    for field in descriptor.fields() {
+        output.insert(field.name().to_string(), serialize_field(&field, message));
+    }
+    output.into()
+}
+
+/// Convert one field of a message to JSON: an array for repeated fields, the
+/// value itself for set (or required) singular fields, and Null otherwise.
+fn serialize_field(
+    field: &FieldDescriptor,
+    message: &dyn protobuf::MessageDyn,
+) -> serde_json::Value {
+    if field.is_repeated() {
+        let v: Vec<serde_json::Value> = field
+            .get_repeated(message)
+            .into_iter()
+            .map(|value| serialize_field_value(field, value))
+            .collect();
+        v.into()
+    } else if field.is_required() || field.has_field(message) {
+        serialize_field_value(field, field.get_singular(message).unwrap())
+    } else {
+        serde_json::Value::Null
+    }
+}
+
+/// Convert a single reflected protobuf value to JSON.
+fn serialize_field_value(_field: &FieldDescriptor, value: ReflectValueRef) -> serde_json::Value {
+    match value {
+        ReflectValueRef::Bool(value) => value.into(),
+        ReflectValueRef::I32(value) => value.into(),
+        ReflectValueRef::I64(value) => value.into(),
+        ReflectValueRef::U32(value) => value.into(),
+        ReflectValueRef::U64(value) => value.into(),
+        ReflectValueRef::F32(value) => value.into(),
+        ReflectValueRef::F64(value) => value.into(),
+        ReflectValueRef::String(value) => value.into(),
+        ReflectValueRef::Bytes(value) => value.into(),
+        // The schema has no enum fields today; fail loudly if one appears.
+        ReflectValueRef::Enum(_value, _ix) => unimplemented!(),
+        ReflectValueRef::Message(value) => serialize_message(&*value),
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000000..97a4c6d6c2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,25 @@
+include!(concat!(env!("OUT_DIR"), "/google.languages_public.rs"));
+include!(concat!(env!("OUT_DIR"), "/data.rs"));
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn regions() {
+        assert!((*REGIONS).contains_key("BG"));
+        assert_eq!(REGIONS.get("BG").unwrap().name.as_deref(), Some("Bulgaria"));
+    }
+
+    #[test]
+    fn scripts() {
+        assert!((*SCRIPTS).contains_key("Arab"));
+        assert_eq!(SCRIPTS.get("Arab").unwrap().name.as_deref(), Some("Arabic"));
+    }
+
+    #[test]
+    fn languages() {
+        assert!(LANGUAGES.len() > 1000);
+        assert!((*LANGUAGES).contains_key("ar_Arab"));
+    }
+}