Browse Source

Add --buffered and same stream check

Before this, running the program with the same input and output file
would destroy the file's contents. Now, that is rejected unless the new
--buffered option is given; it reads the whole input file into memory
before the file is opened for writing.

This requires stevedonovan/lapp#1 to be merged, so for now this version
cannot be published to crates.io because it depends on my fork of lapp.
master
Leonora Tindall 11 months ago
parent
commit
80848d0802
Signed by: nora <nora@nora.codes> GPG Key ID: 99041B68DBC02DAC
4 changed files with 41 additions and 13 deletions
  1. +4
    -4
      Cargo.lock
  2. +2
    -2
      Cargo.toml
  3. +11
    -2
      README
  4. +24
    -5
      src/main.rs

+ 4
- 4
Cargo.lock View File

@@ -3,7 +3,7 @@
[[package]]
name = "lapp"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
source = "git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths#0fb2df69ce5262958401214fca0d0f31d62b1062"

[[package]]
name = "smallvec"
@@ -20,13 +20,13 @@ dependencies = [

[[package]]
name = "utf8-norm"
version = "1.0.1"
version = "1.1.0"
dependencies = [
"lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)",
"unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[metadata]
"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829"
"checksum lapp 0.4.0 (git+https://github.com/NoraCodes/lapp?branch=nora/iofile_paths)" = "<none>"
"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7"
"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426"

+ 2
- 2
Cargo.toml View File

@@ -1,6 +1,6 @@
[package]
name = "utf8-norm"
version = "1.0.1"
version = "1.1.0"
authors = ["Leonora Tindall <nora@nora.codes>"]
edition = "2018"
license = "GPL-3.0-only"
@@ -12,6 +12,6 @@ keywords = ["unicode", "normalize"]
categories = ["command-line-utilities", "internationalization", "localization", "text-processing"]

[dependencies]
lapp = "0.4"
lapp = { git = "https://github.com/NoraCodes/lapp", branch = "nora/iofile_paths" }
unicode-normalization = "0.1"


+ 11
- 2
README View File

@@ -1,9 +1,13 @@
utf8-norm, validate and normalize UTF-8 Unicode data

Version 1.0.1 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
ABOUT

Version 1.1.0 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
Fast command line Unicode normalization, supporting stream safety transformations as well
as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8.

USAGE

Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <infile> <outfile>

<infile> (default stdin) - file from which to read bytes.
@@ -14,7 +18,12 @@ Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <inf
-c, --nfc - write NFC (canonical composition computed from NFD). This is the default.
-C, --nfkc - write NFKC (canonical composition computed from NFKD).
-s, --stream-safe - write stream-safe bytes (Combining Grapheme Joiners, UAX15-D4).
-V, --version - output version information and exit
-b, --buffered - read the entire input file into memory before operating on it.
-V, --version - output version information and exit.

The --buffered option is primarily useful for reading from and writing to the same file. It
reads bytes from the input until end of file and only then begins processing lines of the
input.

utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her
excellent talk regarding Unicode handling in Rust.


+ 24
- 5
src/main.rs View File

@@ -1,6 +1,6 @@
extern crate lapp;
extern crate unicode_normalization;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::io::{BufRead, BufReader, BufWriter, Write, Cursor};
use unicode_normalization::UnicodeNormalization;

// usize because that's the max number of bools in the slice
@@ -16,8 +16,6 @@ fn main() {
let usage = include_str!("../README");
let args = lapp::parse_args(&usage);

let infile = BufReader::new(args.get_infile("infile"));
let mut outfile = BufWriter::new(args.get_outfile("outfile"));

if args.get_bool("version") {
println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>",
@@ -25,6 +23,11 @@ fn main() {
std::process::exit(0);
}

if !args.get_bool("buffered") && args.get_path("infile") == args.get_path("outfile") {
eprintln!("Warning: input and output file are the same. This is not supported without using --buffered, because it would result in removing the file without processing any input.");
std::process::exit(128);
}

let nfd = args.get_bool("nfd");
let nfkd = args.get_bool("nfkd");
let nfc = args.get_bool("nfc");
@@ -33,10 +36,26 @@ fn main() {

if trues(&[nfd, nfkd, nfc, nfkc]) > 1 {
eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive.");
std::process::exit(1);
std::process::exit(128);
}

for line in infile.lines() {
// If buffering is needed, we have to read the whole file BEFORE opening it for
// writing, or else it will be truncated and all the contents will be lost.
let instream: Box<dyn BufRead> = {
if args.get_bool("buffered") {
let mut buffer = Vec::new();
args.get_infile("infile")
.read_to_end(&mut buffer)
.expect("Could not read input file into buffer. Error");
Box::new(Cursor::new(buffer))
} else {
Box::new(BufReader::new(args.get_infile("infile")))
}
};

let mut outfile = BufWriter::new(args.get_outfile("outfile"));

for line in instream.lines() {
let mut line = line.expect("Could not read line from file. Error").clone();
if args.get_bool("crlf") {
line.push('\x0D');


Loading…
Cancel
Save