Add --buffered and same stream check
Before this, running the program with the same input and output file would destroy the file's contents. Now, doing so is rejected unless the new --buffered option is given, which reads the whole input file into memory before opening it for writing. This requires stevedonovan/lapp#1 to be merged, so for now this version cannot be published to crates.io because it depends on my fork of lapp.
This commit is contained in:
parent
234678befe
commit
80848d0802
|
@ -3,7 +3,7 @@
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lapp"
|
name = "lapp"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths#0fb2df69ce5262958401214fca0d0f31d62b1062"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "smallvec"
|
name = "smallvec"
|
||||||
|
@ -20,13 +20,13 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8-norm"
|
name = "utf8-norm"
|
||||||
version = "1.0.1"
|
version = "1.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)",
|
||||||
"unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
"unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829"
|
"checksum lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)" = "<none>"
|
||||||
"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7"
|
"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7"
|
||||||
"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426"
|
"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426"
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "utf8-norm"
|
name = "utf8-norm"
|
||||||
version = "1.0.1"
|
version = "1.1.0"
|
||||||
authors = ["Leonora Tindall <nora@nora.codes>"]
|
authors = ["Leonora Tindall <nora@nora.codes>"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
license = "GPL-3.0-only"
|
license = "GPL-3.0-only"
|
||||||
|
@ -12,6 +12,6 @@ keywords = ["unicode", "normalize"]
|
||||||
categories = ["command-line-utilities", "internationalization", "localization", "text-processing"]
|
categories = ["command-line-utilities", "internationalization", "localization", "text-processing"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
lapp = "0.4"
|
lapp = { git = "https://github.com/NoraCodes/lapp", branch = "nora/iofile_paths" }
|
||||||
unicode-normalization = "0.1"
|
unicode-normalization = "0.1"
|
||||||
|
|
||||||
|
|
13
README
13
README
|
@ -1,9 +1,13 @@
|
||||||
utf8-norm, validate and normalize UTF-8 Unicode data
|
utf8-norm, validate and normalize UTF-8 Unicode data
|
||||||
|
|
||||||
Version 1.0.1 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
|
ABOUT
|
||||||
|
|
||||||
|
Version 1.1.0 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
|
||||||
Fast command line Unicode normalization, supporting stream safety transformations as well
|
Fast command line Unicode normalization, supporting stream safety transformations as well
|
||||||
as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8.
|
as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8.
|
||||||
|
|
||||||
|
USAGE
|
||||||
|
|
||||||
Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <infile> <outfile>
|
Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] [--buffered] <infile> <outfile>
|
||||||
|
|
||||||
<infile> (default stdin) - file from which to read bytes.
|
<infile> (default stdin) - file from which to read bytes.
|
||||||
|
@ -14,7 +18,12 @@ Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <inf
|
||||||
-c, --nfc - write NFC (canonical composition computed from NFD). This is the default.
|
-c, --nfc - write NFC (canonical composition computed from NFD). This is the default.
|
||||||
-C, --nfkc - write NFKC (canonical composition computed from NFKD).
|
-C, --nfkc - write NFKC (canonical composition computed from NFKD).
|
||||||
-s, --stream-safe - write stream-safe bytes (Conjoining Grapheme Joiners, UAX15-D4).
|
-s, --stream-safe - write stream-safe bytes (Conjoining Grapheme Joiners, UAX15-D4).
|
||||||
-V, --version - output version information and exit
|
-b, --buffered - read the entire input file into memory before operating on it.
|
||||||
|
-V, --version - output version information and exit.
|
||||||
|
|
||||||
|
The --buffered option is primarily useful for reading from and writing to the same file. It will
|
||||||
|
read bytes from the input until end of file and only then begin processing lines of the
|
||||||
|
input.
|
||||||
|
|
||||||
utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her
|
utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her
|
||||||
excellent talk regarding Unicode handling in Rust.
|
excellent talk regarding Unicode handling in Rust.
|
||||||
|
|
29
src/main.rs
29
src/main.rs
|
@ -1,6 +1,6 @@
|
||||||
extern crate lapp;
|
extern crate lapp;
|
||||||
extern crate unicode_normalization;
|
extern crate unicode_normalization;
|
||||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
use std::io::{BufRead, BufReader, BufWriter, Write, Cursor};
|
||||||
use unicode_normalization::UnicodeNormalization;
|
use unicode_normalization::UnicodeNormalization;
|
||||||
|
|
||||||
// usize because that's the max number of bools in the slice
|
// usize because that's the max number of bools in the slice
|
||||||
|
@ -16,8 +16,6 @@ fn main() {
|
||||||
let usage = include_str!("../README");
|
let usage = include_str!("../README");
|
||||||
let args = lapp::parse_args(&usage);
|
let args = lapp::parse_args(&usage);
|
||||||
|
|
||||||
let infile = BufReader::new(args.get_infile("infile"));
|
|
||||||
let mut outfile = BufWriter::new(args.get_outfile("outfile"));
|
|
||||||
|
|
||||||
if args.get_bool("version") {
|
if args.get_bool("version") {
|
||||||
println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>",
|
println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>",
|
||||||
|
@ -25,6 +23,11 @@ fn main() {
|
||||||
std::process::exit(0);
|
std::process::exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !args.get_bool("buffered") && args.get_path("infile") == args.get_path("outfile") {
|
||||||
|
eprintln!("Warning: input and output file are the same. This is not supported without using --buffered, because it would result in removing the file without processing any input.");
|
||||||
|
std::process::exit(128);
|
||||||
|
}
|
||||||
|
|
||||||
let nfd = args.get_bool("nfd");
|
let nfd = args.get_bool("nfd");
|
||||||
let nfkd = args.get_bool("nfkd");
|
let nfkd = args.get_bool("nfkd");
|
||||||
let nfc = args.get_bool("nfc");
|
let nfc = args.get_bool("nfc");
|
||||||
|
@ -33,10 +36,26 @@ fn main() {
|
||||||
|
|
||||||
if trues(&[nfd, nfkd, nfc, nfkc]) > 1 {
|
if trues(&[nfd, nfkd, nfc, nfkc]) > 1 {
|
||||||
eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive.");
|
eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive.");
|
||||||
std::process::exit(1);
|
std::process::exit(128);
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in infile.lines() {
|
// If buffering is needed, we have to read the whole file BEFORE opening it for
|
||||||
|
// writing, or else it will be truncated and all the contents will be lost.
|
||||||
|
let instream: Box<dyn BufRead> = {
|
||||||
|
if args.get_bool("buffered") {
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
args.get_infile("infile")
|
||||||
|
.read_to_end(&mut buffer)
|
||||||
|
.expect("Could not read input file into buffer. Error");
|
||||||
|
Box::new(Cursor::new(buffer))
|
||||||
|
} else {
|
||||||
|
Box::new(BufReader::new(args.get_infile("infile")))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut outfile = BufWriter::new(args.get_outfile("outfile"));
|
||||||
|
|
||||||
|
for line in instream.lines() {
|
||||||
let mut line = line.expect("Could not read line from file. Error").clone();
|
let mut line = line.expect("Could not read line from file. Error").clone();
|
||||||
if args.get_bool("crlf") {
|
if args.get_bool("crlf") {
|
||||||
line.push('\x0D');
|
line.push('\x0D');
|
||||||
|
|
Loading…
Reference in New Issue