Add --buffered and same stream check

Before this, running the program with the same input and output file
would destroy the file's contents. Now, it isn't allowed without the new
--buffered option, which reads the whole file into memory before opening
it for writing.

This requires stevedonovan/lapp#1 to be merged, so for now this version
cannot be published to crates.io because it depends on my fork of lapp.
This commit is contained in:
Leonora Tindall 2019-10-21 08:26:35 -05:00
parent 234678befe
commit 80848d0802
Signed by: nora
GPG Key ID: 99041B68DBC02DAC
4 changed files with 41 additions and 13 deletions

8
Cargo.lock generated
View File

@ -3,7 +3,7 @@
[[package]] [[package]]
name = "lapp" name = "lapp"
version = "0.4.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths#0fb2df69ce5262958401214fca0d0f31d62b1062"
[[package]] [[package]]
name = "smallvec" name = "smallvec"
@ -20,13 +20,13 @@ dependencies = [
[[package]] [[package]]
name = "utf8-norm" name = "utf8-norm"
version = "1.0.1" version = "1.1.0"
dependencies = [ dependencies = [
"lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)",
"unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[metadata] [metadata]
"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829" "checksum lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)" = "<none>"
"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7" "checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7"
"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426" "checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426"

View File

@ -1,6 +1,6 @@
[package] [package]
name = "utf8-norm" name = "utf8-norm"
version = "1.0.1" version = "1.1.0"
authors = ["Leonora Tindall <nora@nora.codes>"] authors = ["Leonora Tindall <nora@nora.codes>"]
edition = "2018" edition = "2018"
license = "GPL-3.0-only" license = "GPL-3.0-only"
@ -12,6 +12,6 @@ keywords = ["unicode", "normalize"]
categories = ["command-line-utilities", "internationalization", "localization", "text-processing"] categories = ["command-line-utilities", "internationalization", "localization", "text-processing"]
[dependencies] [dependencies]
lapp = "0.4" lapp = { git = "https://github.com/NoraCodes/lapp", branch = "nora/iofile_paths" }
unicode-normalization = "0.1" unicode-normalization = "0.1"

13
README
View File

@ -1,9 +1,13 @@
utf8-norm, validate and normalize UTF-8 Unicode data utf8-norm, validate and normalize UTF-8 Unicode data
Version 1.0.1 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes> ABOUT
Version 1.1.0 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
Fast command line Unicode normalization, supporting stream safety transformations as well Fast command line Unicode normalization, supporting stream safety transformations as well
as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8. as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8.
USAGE
Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <infile> <outfile> Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <infile> <outfile>
<infile> (default stdin) - file from which to read bytes. <infile> (default stdin) - file from which to read bytes.
@ -14,7 +18,12 @@ Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <inf
-c, --nfc - write NFC (canonical composition computed from NFD). This is the default. -c, --nfc - write NFC (canonical composition computed from NFD). This is the default.
-C, --nfkc - write NFKC (canonical composition computed from NFKD). -C, --nfkc - write NFKC (canonical composition computed from NFKD).
-s, --stream-safe - write stream-safe bytes (Combining Grapheme Joiners, UAX15-D4). -s, --stream-safe - write stream-safe bytes (Combining Grapheme Joiners, UAX15-D4).
-V, --version - output version information and exit -b, --buffered - read the entire input file into memory before operating on it.
-V, --version - output version information and exit.
The --buffered option is primarily useful when reading from and writing to the same file.
It reads the input until end of file and only then begins processing the lines of the
input.
utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her
excellent talk regarding Unicode handling in Rust. excellent talk regarding Unicode handling in Rust.

View File

@ -1,6 +1,6 @@
extern crate lapp; extern crate lapp;
extern crate unicode_normalization; extern crate unicode_normalization;
use std::io::{BufRead, BufReader, BufWriter, Write}; use std::io::{BufRead, BufReader, BufWriter, Write, Cursor};
use unicode_normalization::UnicodeNormalization; use unicode_normalization::UnicodeNormalization;
// usize because that's the max number of bools in the slice // usize because that's the max number of bools in the slice
@ -16,8 +16,6 @@ fn main() {
let usage = include_str!("../README"); let usage = include_str!("../README");
let args = lapp::parse_args(&usage); let args = lapp::parse_args(&usage);
let infile = BufReader::new(args.get_infile("infile"));
let mut outfile = BufWriter::new(args.get_outfile("outfile"));
if args.get_bool("version") { if args.get_bool("version") {
println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>", println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>",
@ -25,6 +23,11 @@ fn main() {
std::process::exit(0); std::process::exit(0);
} }
if !args.get_bool("buffered") && args.get_path("infile") == args.get_path("outfile") {
eprintln!("Warning: input and output file are the same. This is not supported without using --buffered, because it would result in removing the file without processing any input.");
std::process::exit(128);
}
let nfd = args.get_bool("nfd"); let nfd = args.get_bool("nfd");
let nfkd = args.get_bool("nfkd"); let nfkd = args.get_bool("nfkd");
let nfc = args.get_bool("nfc"); let nfc = args.get_bool("nfc");
@ -33,10 +36,26 @@ fn main() {
if trues(&[nfd, nfkd, nfc, nfkc]) > 1 { if trues(&[nfd, nfkd, nfc, nfkc]) > 1 {
eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive."); eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive.");
std::process::exit(1); std::process::exit(128);
} }
for line in infile.lines() { // If buffering is needed, we have to read the whole file BEFORE opening it for
// writing, or else it will be truncated and all the contents will be lost.
let instream: Box<dyn BufRead> = {
if args.get_bool("buffered") {
let mut buffer = Vec::new();
args.get_infile("infile")
.read_to_end(&mut buffer)
.expect("Could not read input file into buffer. Error");
Box::new(Cursor::new(buffer))
} else {
Box::new(BufReader::new(args.get_infile("infile")))
}
};
let mut outfile = BufWriter::new(args.get_outfile("outfile"));
for line in instream.lines() {
let mut line = line.expect("Could not read line from file. Error").clone(); let mut line = line.expect("Could not read line from file. Error").clone();
if args.get_bool("crlf") { if args.get_bool("crlf") {
line.push('\x0D'); line.push('\x0D');