commit c44f33a056b52a75c8bae67a42c82f8d05770a1b Author: Leonora Tindall Date: Sat Oct 19 15:50:52 2019 -0400 First commit - working version from RBR diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..53eaa21 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +**/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..ac78c40 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,32 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "lapp" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "smallvec" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unicode-normalization" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-norm" +version = "1.0.0" +dependencies = [ + "lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[metadata] +"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829" +"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7" +"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4070008 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "utf8-norm" +version = "1.0.0" +authors = ["Leonora Tindall "] +edition = "2018" +license = "GPL-3.0-only" +description = "Command line tool to validate 
and normalize UTF-8 data" +readme = "README" +homepage = "https://nora.codes/projects/utf8-norm" +repository = "https://git.nora.codes/nora/utf-norm" +keywords = ["unicode", "normalize"] +categories = ["command-line-utilities", "internationalization", "localization", "text-processing"] + +[dependencies] +lapp = "0.4" +unicode-normalization = "0.1" + diff --git a/README b/README new file mode 100644 index 0000000..bcd52cb --- /dev/null +++ b/README @@ -0,0 +1,23 @@ +utf8-norm, validate and normalize UTF-8 Unicode data + +Version 1.0.0 licensed GPLv3. (C) 2019 Leonora Tindall +Fast command line Unicode normalization, supporting stream safety transformations as well +as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8. + +Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] + + <infile> (default stdin) - file from which to read bytes. + <outfile> (default stdout) - file to which to write normalized Unicode. + -w, --crlf - write CRLF (Windows) instead of LF only (Unix) at the end of lines. + -d, --nfd - write NFD (canonical decomposition). + -D, --nfkd - write NFKD (compatibility decomposition). + -c, --nfc - write NFC (canonical composition computed from NFD). This is the default. + -C, --nfkc - write NFKC (canonical composition computed from NFKD). + -s, --stream-safe - write stream-safe bytes (Combining Grapheme Joiners, UAX15-D4). + -V, --version - output version information and exit + +utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her +excellent talk regarding Unicode handling in Rust. + +Natively install as `cargo install utf8-norm` or from your distribution's package manager. 
+ diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..5c6c20a --- /dev/null +++ b/src/main.rs @@ -0,0 +1,66 @@ +extern crate lapp; +extern crate unicode_normalization; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use unicode_normalization::UnicodeNormalization; + +// usize because that's the max number of bools in the slice +fn trues(bools: &[bool]) -> usize { + let mut n = 0; + for b in bools { + if *b { n += 1 } + } + n +} + +fn main() { + let usage = include_str!("../README"); + let args = lapp::parse_args(&usage); + + let infile = BufReader::new(args.get_infile("infile")); + let mut outfile = BufWriter::new(args.get_outfile("outfile")); + + if args.get_bool("version") { + println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall ", + env!("CARGO_PKG_VERSION")); + std::process::exit(0); + } + + let nfd = args.get_bool("nfd"); + let nfkd = args.get_bool("nfkd"); + let nfc = args.get_bool("nfc"); + let nfkc = args.get_bool("nfkc"); + let ss = args.get_bool("stream-safe"); + + if trues(&[nfd, nfkd, nfc, nfkc]) > 1 { + eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive."); + std::process::exit(1); + } + + for line in infile.lines() { + let mut line = line.expect("Could not read line from file. Error").clone(); + if args.get_bool("crlf") { + line.push('\x0D'); + } + line.push('\x0A'); + + let normalized: Box>; + if nfd { + normalized = Box::new(line.chars().nfd()); + } else if nfkc { + normalized = Box::new(line.chars().nfkc()); + } else if nfkd { + normalized = Box::new(line.chars().nfkd()); + } else { + normalized = Box::new(line.chars().nfc()); + } + + let output: String; + if ss { + output = normalized.stream_safe().collect(); + } else { + output = normalized.collect(); + } + + write!(&mut outfile, "{}", output).expect("Could not write to output. Error"); + } +}