diff --git a/Cargo.lock b/Cargo.lock index cd979be..2bf839d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,7 +3,7 @@ [[package]] name = "lapp" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" +source = "git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths#0fb2df69ce5262958401214fca0d0f31d62b1062" [[package]] name = "smallvec" @@ -20,13 +20,13 @@ dependencies = [ [[package]] name = "utf8-norm" -version = "1.0.1" +version = "1.1.0" dependencies = [ - "lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)", "unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", ] [metadata] -"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829" +"checksum lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)" = "" "checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7" "checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426" diff --git a/Cargo.toml b/Cargo.toml index c22ce9e..5e2b094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "utf8-norm" -version = "1.0.1" +version = "1.1.0" authors = ["Leonora Tindall "] edition = "2018" license = "GPL-3.0-only" @@ -12,6 +12,6 @@ keywords = ["unicode", "normalize"] categories = ["command-line-utilities", "internationalization", "localization", "text-processing"] [dependencies] -lapp = "0.4" +lapp = { git = "https://github.com/NoraCodes/lapp", branch = "nora/iofile_paths" } unicode-normalization = "0.1" diff --git a/README b/README index 44b3eae..5d345b6 100644 --- a/README +++ b/README @@ -1,9 +1,13 @@ utf8-norm, validate and normalize UTF-8 Unicode data -Version 1.0.1 licensed GPLv3. (C) 2019 Leonora Tindall +ABOUT + +Version 1.1.0 licensed GPLv3. (C) 2019 Leonora Tindall Fast command line Unicode normalization, supporting stream safety transformations as well as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8. +USAGE + Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] (default stdin) - file from which to read bytes. @@ -14,7 +18,12 @@ Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] ", @@ -25,6 +23,11 @@ fn main() { std::process::exit(0); } + if !args.get_bool("buffered") && args.get_path("infile") == args.get_path("outfile") { + eprintln!("Warning: input and output file are the same. This is not supported without using --buffered, because it would result in removing the file without processing any input."); + std::process::exit(128); + } + let nfd = args.get_bool("nfd"); let nfkd = args.get_bool("nfkd"); let nfc = args.get_bool("nfc"); @@ -33,10 +36,26 @@ fn main() { if trues(&[nfd, nfkd, nfc, nfkc]) > 1 { eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive."); - std::process::exit(1); + std::process::exit(128); } - for line in infile.lines() { + // If buffering is needed, we have to read the whole file BEFORE opening it for + // writing, or else it will be truncated and all the contents will be lost. + let instream: Box = { + if args.get_bool("buffered") { + let mut buffer = Vec::new(); + args.get_infile("infile") + .read_to_end(&mut buffer) + .expect("Could not read input file into buffer. Error"); + Box::new(Cursor::new(buffer)) + } else { + Box::new(BufReader::new(args.get_infile("infile"))) + } + }; + + let mut outfile = BufWriter::new(args.get_outfile("outfile")); + + for line in instream.lines() { let mut line = line.expect("Could not read line from file. Error").clone(); if args.get_bool("crlf") { line.push('\x0D');