From a60c0f58470f78545544d01d525eae481511abec Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Sat, 7 Jun 2025 00:38:22 +0200 Subject: grit: Add basic parsing of xlf This doesn't read the full xliff format, it only reads the translation target string and the id for the translation unit. --- server/common/src/grit.rs | 377 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) (limited to 'server/common/src/grit.rs') diff --git a/server/common/src/grit.rs b/server/common/src/grit.rs index a510724..c0351c4 100644 --- a/server/common/src/grit.rs +++ b/server/common/src/grit.rs @@ -1,6 +1,7 @@ #![allow(dead_code)] use anyhow::Error; +use std::collections::VecDeque; use std::fs; use std::io::{BufReader, Read}; use std::path::Path; @@ -123,6 +124,20 @@ pub struct GritPart { pub messages: Vec, } +#[derive(Debug, PartialEq)] +pub struct TranslationFile { + pub target_language: String, + + pub units: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct TranslationUnit { + pub id: i64, + + pub target: Vec, +} + fn get_opt_attribute<'a>(attributes: &'a Vec, name: &str) -> Option<&'a str> { for attribute in attributes { if attribute.name.local_name == name { @@ -1189,3 +1204,365 @@ pub fn get_message_id(message: &Message) -> i64 { // Avoid returning negative ids message_id & 0x7fffffffffffffff } + +fn parse_translation_unit_target_element( + _attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result> { + let mut content = Vec::::new(); + let mut first = true; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "ph" => { + first = false; + content.push(parse_translation_placeholder_element(&attributes, reader)?); + } + _ => { + return Err(Error::msg(format!( + "Unexpected {0} in file", + name.local_name + ))); + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "target"); + break; + } + XmlEvent::Characters(data) => content.push(TextPlaceholder::Text(if first { + first = false; + data.trim_start().to_string() + } else { + data + })), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + if !first { + match content.last_mut().unwrap() { + TextPlaceholder::Text(data) => { + data.truncate(data.trim_end().len()); + if data.is_empty() { + content.pop(); + } + } + TextPlaceholder::Placeholder { + name: _, + content: _, + example: _, + } => {} + } + } + + Ok(content) +} + +fn parse_translation_placeholder_element( + attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result { + let id = get_attribute(attributes, "id")?; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes: _, + namespace: _, + } => { + return Err(Error::msg(format!("Unexpected {0} in ph", name.local_name))); + } + XmlEvent::EndElement { name } => { + assert!(name.local_name == "ph"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TextPlaceholder::Placeholder { + name: id.to_string(), + content: String::new(), + example: None, + }) +} + +fn parse_translation_unit_element( + attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result { + let id = get_attribute(attributes, "id")?.parse::()?; + + let mut target: Option> = None; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "target" => { + if target.is_some() { + return Err(Error::msg("Two target in trans-unit")); + } + target = Some(parse_translation_unit_target_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "trans-unit"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TranslationUnit { + id, + target: target.expect("No target in trans-unit"), + }) +} + +fn parse_translation_body_element( + _attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result> { + let mut units = Vec::::new(); + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "trans-unit" => { + units.push(parse_translation_unit_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "body"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(units) +} + +fn parse_translation_file_element( + attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result { + let target_language = get_attribute(attributes, "target-language")?; + + let mut units: Option> = None; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "body" => { + if units.is_some() { + return Err(Error::msg("More than one body in file")); + } + units = Some(parse_translation_body_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "file"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TranslationFile { + target_language: target_language.to_string(), + units: units.expect("body element in file"), + }) +} + +fn parse_xliff_element( + _attributes: &Vec, + reader: &mut EventReader, +) -> anyhow::Result { + let mut file = VecDeque::::new(); + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "file" => { + file.push_back(parse_translation_file_element(&attributes, reader)?); + } + _ => (), + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "xliff"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + if file.is_empty() { + Err(Error::msg("No file in xliff")) + } else if file.len() == 1 { + Ok(file.pop_front().unwrap()) + } else { + let mut ret = file.pop_front().unwrap(); + while !file.is_empty() { + let other = file.pop_front().unwrap(); + if other.target_language == ret.target_language { + let end = ret.units.len(); + ret.units.splice(end..end, other.units); + } else { + return Err(Error::msg( + "Multiple translations in the same file, not supported yet", + )); + } + } + + Ok(ret) + } +} + +pub async fn parse_xlf(path: impl AsRef) -> anyhow::Result { + let path = path.as_ref().to_path_buf(); + spawn_blocking(move || { + let file = fs::File::open(path)?; + let reader = BufReader::new(file); + let mut ereader = ParserConfig::new() + .ignore_comments(true) + .whitespace_to_characters(true) + .cdata_to_characters(true) + .create_reader(reader); + let mut ret: Option = None; + loop { + let event = ereader.next()?; + match event { + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => { + if name.local_name == "xliff" { + ret = Some(parse_xliff_element(&attributes, &mut ereader)?); + } else { + return Err(Error::msg("Document root != xliff")); + } + } + XmlEvent::EndDocument => break, + XmlEvent::EndElement { name: _ } => panic!("Unexpected EoE"), + XmlEvent::Characters(_) => (), + + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + Ok(ret.unwrap()) + }) + .await + .unwrap() +} -- cgit v1.2.3-70-g09d2