diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-06-07 00:38:22 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-06-07 00:39:28 +0200 |
| commit | a60c0f58470f78545544d01d525eae481511abec (patch) | |
| tree | f06d8ace50fcf7f773db6e03d142081c3a9ac62b /server/common/src/grit.rs | |
| parent | ad893a4fb44244132d710d7f94fc99a7d83f1b87 (diff) | |
grit: Add basic parsing of xlf
This doesn't read the full XLIFF format; it only reads the translation
target string and the id of each translation unit.
Diffstat (limited to 'server/common/src/grit.rs')
| -rw-r--r-- | server/common/src/grit.rs | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/server/common/src/grit.rs b/server/common/src/grit.rs index a510724..c0351c4 100644 --- a/server/common/src/grit.rs +++ b/server/common/src/grit.rs @@ -1,6 +1,7 @@ #![allow(dead_code)] use anyhow::Error; +use std::collections::VecDeque; use std::fs; use std::io::{BufReader, Read}; use std::path::Path; @@ -123,6 +124,20 @@ pub struct GritPart { pub messages: Vec<IfMessage>, } +#[derive(Debug, PartialEq)] +pub struct TranslationFile { + pub target_language: String, + + pub units: Vec<TranslationUnit>, +} + +#[derive(Debug, PartialEq)] +pub struct TranslationUnit { + pub id: i64, + + pub target: Vec<TextPlaceholder>, +} + fn get_opt_attribute<'a>(attributes: &'a Vec<OwnedAttribute>, name: &str) -> Option<&'a str> { for attribute in attributes { if attribute.name.local_name == name { @@ -1189,3 +1204,365 @@ pub fn get_message_id(message: &Message) -> i64 { // Avoid returning negative ids message_id & 0x7fffffffffffffff } + +fn parse_translation_unit_target_element<R: Read>( + _attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<Vec<TextPlaceholder>> { + let mut content = Vec::<TextPlaceholder>::new(); + let mut first = true; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "ph" => { + first = false; + content.push(parse_translation_placeholder_element(&attributes, reader)?); + } + _ => { + return Err(Error::msg(format!( + "Unexpected {0} in file", + name.local_name + ))); + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "target"); + break; + } + XmlEvent::Characters(data) => content.push(TextPlaceholder::Text(if first { + first = false; + data.trim_start().to_string() + } else { + data + })), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } 
=> (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + if !first { + match content.last_mut().unwrap() { + TextPlaceholder::Text(data) => { + data.truncate(data.trim_end().len()); + if data.is_empty() { + content.pop(); + } + } + TextPlaceholder::Placeholder { + name: _, + content: _, + example: _, + } => {} + } + } + + Ok(content) +} + +fn parse_translation_placeholder_element<R: Read>( + attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<TextPlaceholder> { + let id = get_attribute(attributes, "id")?; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes: _, + namespace: _, + } => { + return Err(Error::msg(format!("Unexpected {0} in ph", name.local_name))); + } + XmlEvent::EndElement { name } => { + assert!(name.local_name == "ph"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TextPlaceholder::Placeholder { + name: id.to_string(), + content: String::new(), + example: None, + }) +} + +fn parse_translation_unit_element<R: Read>( + attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<TranslationUnit> { + let id = get_attribute(attributes, "id")?.parse::<i64>()?; + + let mut target: Option<Vec<TextPlaceholder>> = None; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "target" => { + if target.is_some() { + return Err(Error::msg("Two target in trans-unit")); + } + target = Some(parse_translation_unit_target_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + 
XmlEvent::EndElement { name } => { + assert!(name.local_name == "trans-unit"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TranslationUnit { + id, + target: target.expect("No target in trans-unit"), + }) +} + +fn parse_translation_body_element<R: Read>( + _attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<Vec<TranslationUnit>> { + let mut units = Vec::<TranslationUnit>::new(); + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "trans-unit" => { + units.push(parse_translation_unit_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "body"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(units) +} + +fn parse_translation_file_element<R: Read>( + attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<TranslationFile> { + let target_language = get_attribute(attributes, "target-language")?; + + let mut units: Option<Vec<TranslationUnit>> = None; + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "body" => { + if units.is_some() { + return Err(Error::msg("More than one body in file")); + } + 
units = Some(parse_translation_body_element(&attributes, reader)?); + } + _ => { + reader.skip()?; + } + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "file"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + Ok(TranslationFile { + target_language: target_language.to_string(), + units: units.expect("body element in file"), + }) +} + +fn parse_xliff_element<R: Read>( + _attributes: &Vec<OwnedAttribute>, + reader: &mut EventReader<R>, +) -> anyhow::Result<TranslationFile> { + let mut file = VecDeque::<TranslationFile>::new(); + + loop { + let event = reader.next()?; + match event { + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => match name.local_name.as_str() { + "file" => { + file.push_back(parse_translation_file_element(&attributes, reader)?); + } + _ => (), + }, + XmlEvent::EndElement { name } => { + assert!(name.local_name == "xliff"); + break; + } + XmlEvent::Characters(_) => (), + + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::EndDocument => panic!("Unexpected EOD"), + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + + if file.is_empty() { + Err(Error::msg("No file in xliff")) + } else if file.len() == 1 { + Ok(file.pop_front().unwrap()) + } else { + let mut ret = file.pop_front().unwrap(); + while !file.is_empty() { + let other = file.pop_front().unwrap(); + if other.target_language == ret.target_language { + let end = ret.units.len(); + ret.units.splice(end..end, other.units); + } else { + return Err(Error::msg( + "Multiple translations in the same file, not 
supported yet", + )); + } + } + + Ok(ret) + } +} + +pub async fn parse_xlf(path: impl AsRef<Path>) -> anyhow::Result<TranslationFile> { + let path = path.as_ref().to_path_buf(); + spawn_blocking(move || { + let file = fs::File::open(path)?; + let reader = BufReader::new(file); + let mut ereader = ParserConfig::new() + .ignore_comments(true) + .whitespace_to_characters(true) + .cdata_to_characters(true) + .create_reader(reader); + let mut ret: Option<TranslationFile> = None; + loop { + let event = ereader.next()?; + match event { + XmlEvent::StartDocument { + version: _, + encoding: _, + standalone: _, + } => (), + XmlEvent::StartElement { + name, + attributes, + namespace: _, + } => { + if name.local_name == "xliff" { + ret = Some(parse_xliff_element(&attributes, &mut ereader)?); + } else { + return Err(Error::msg("Document root != xliff")); + } + } + XmlEvent::EndDocument => break, + XmlEvent::EndElement { name: _ } => panic!("Unexpected EoE"), + XmlEvent::Characters(_) => (), + + XmlEvent::ProcessingInstruction { name: _, data: _ } => (), + XmlEvent::CData(_) => (), + XmlEvent::Comment(_) => (), + XmlEvent::Whitespace(_) => (), + } + } + Ok(ret.unwrap()) + }) + .await + .unwrap() +} |
