From 80848d08023e92f64e25391d2740468e0a47a9cf Mon Sep 17 00:00:00 2001 From: Leonora Tindall Date: Mon, 21 Oct 2019 08:26:35 -0500 Subject: [PATCH] Add --buffered and same stream check Before this change, running the program with the same input and output file would destroy the file's contents. Now, using the same file for both is not allowed without the new --buffered option, which reads the whole file into memory before opening it for writing. This requires stevedonovan/lapp#1 to be merged, so for now this version is not publishable to crates.io because it depends on my fork of lapp. --- Cargo.lock | 8 ++++---- Cargo.toml | 4 ++-- README | 13 +++++++++++-- src/main.rs | 29 ++++++++++++++++++++++++----- 4 files changed, 41 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd979be..2bf839d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,7 +3,7 @@ [[package]] name = "lapp" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" +source = "git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths#0fb2df69ce5262958401214fca0d0f31d62b1062" [[package]] name = "smallvec" @@ -20,13 +20,13 @@ dependencies = [ [[package]] name = "utf8-norm" -version = "1.0.1" +version = "1.1.0" dependencies = [ - "lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)", "unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", ] [metadata] -"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829" +"checksum lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)" = "" "checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7" "checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = 
"141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426" diff --git a/Cargo.toml b/Cargo.toml index c22ce9e..5e2b094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "utf8-norm" -version = "1.0.1" +version = "1.1.0" authors = ["Leonora Tindall "] edition = "2018" license = "GPL-3.0-only" @@ -12,6 +12,6 @@ keywords = ["unicode", "normalize"] categories = ["command-line-utilities", "internationalization", "localization", "text-processing"] [dependencies] -lapp = "0.4" +lapp = { git = "https://github.com/NoraCodes/lapp", branch = "nora/iofile_paths" } unicode-normalization = "0.1" diff --git a/README b/README index 44b3eae..5d345b6 100644 --- a/README +++ b/README @@ -1,9 +1,13 @@ utf8-norm, validate and normalize UTF-8 Unicode data -Version 1.0.1 licensed GPLv3. (C) 2019 Leonora Tindall +ABOUT + +Version 1.1.0 licensed GPLv3. (C) 2019 Leonora Tindall Fast command line Unicode normalization, supporting stream safety transformations as well as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8. +USAGE + Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] (default stdin) - file from which to read bytes. @@ -14,7 +18,12 @@ Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] ", @@ -25,6 +23,11 @@ fn main() { std::process::exit(0); } + if !args.get_bool("buffered") && args.get_path("infile") == args.get_path("outfile") { + eprintln!("Warning: input and output file are the same. 
This is not supported without using --buffered, because it would result in removing the file without processing any input."); + std::process::exit(128); + } + let nfd = args.get_bool("nfd"); let nfkd = args.get_bool("nfkd"); let nfc = args.get_bool("nfc"); @@ -33,10 +36,26 @@ fn main() { if trues(&[nfd, nfkd, nfc, nfkc]) > 1 { eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive."); - std::process::exit(1); + std::process::exit(128); } - for line in infile.lines() { + // If buffering is needed, we have to read the whole file BEFORE opening it for + // writing, or else it will be truncated and all the contents will be lost. + let instream: Box = { + if args.get_bool("buffered") { + let mut buffer = Vec::new(); + args.get_infile("infile") + .read_to_end(&mut buffer) + .expect("Could not read input file into buffer. Error"); + Box::new(Cursor::new(buffer)) + } else { + Box::new(BufReader::new(args.get_infile("infile"))) + } + }; + + let mut outfile = BufWriter::new(args.get_outfile("outfile")); + + for line in instream.lines() { let mut line = line.expect("Could not read line from file. Error").clone(); if args.get_bool("crlf") { line.push('\x0D');