Browse Source

First commit - working version from RBR

master
Leonora Tindall 11 months ago
commit
c44f33a056
Signed by: nora <nora@nora.codes> GPG Key ID: 99041B68DBC02DAC
5 changed files with 140 additions and 0 deletions
  1. +2
    -0
      .gitignore
  2. +32
    -0
      Cargo.lock
  3. +17
    -0
      Cargo.toml
  4. +23
    -0
      README
  5. +66
    -0
      src/main.rs

+ 2
- 0
.gitignore View File

@@ -0,0 +1,2 @@
/target
**/*.rs.bk

+ 32
- 0
Cargo.lock View File

@@ -0,0 +1,32 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "lapp"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "smallvec"
version = "0.6.10"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "unicode-normalization"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "utf8-norm"
version = "1.0.0"
dependencies = [
"lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[metadata]
"checksum lapp 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60bf485afeba9437a275ad29a9383b03f2978450e7feceffb55be8c0dbad9829"
"checksum smallvec 0.6.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab606a9c5e214920bb66c458cd7be8ef094f813f20fe77a54cc7dbfff220d4b7"
"checksum unicode-normalization 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "141339a08b982d942be2ca06ff8b076563cbe223d1befd5450716790d44e2426"

+ 17
- 0
Cargo.toml View File

@@ -0,0 +1,17 @@
[package]
name = "utf8-norm"
version = "1.0.0"
authors = ["Leonora Tindall <nora@nora.codes>"]
edition = "2018"
license = "GPL-3.0-only"
description = "Command line tool to validate and normalize UTF-8 data"
readme = "README"
homepage = "https://nora.codes/projects/utf8-norm"
repository = "https://git.nora.codes/nora/utf-norm"
keywords = ["unicode", "normalize"]
categories = ["command-line-utilities", "internationalization", "localization", "text-processing"]

[dependencies]
lapp = "0.4"
unicode-normalization = "0.1"


+ 23
- 0
README View File

@@ -0,0 +1,23 @@
utf8-norm, validate and normalize UTF-8 Unicode data

Version 1.0.0 licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>
Fast command line Unicode normalization, supporting stream safety transformations as well
as NFC, NFD, NFKD, and NFKC. Exits with failure if the incoming stream is not valid UTF-8.

Usage: utf8-norm [--nfc | --nfd | --nfkc | --nfkd] [--stream-safe] [--crlf] <infile> <outfile>

<infile> (default stdin) - file from which to read bytes.
<outfile> (default stdout) - file to which to write normalized Unicode.
-w, --crlf - write CRLF (Windows) instead of LF only (Unix) at the end of lines.
-d, --nfd - write NFD (canonical decomposition).
-D, --nfkd - write NFKD (compatibility decomposition).
-c, --nfc - write NFC (canonical composition computed from NFD). This is the default.
-C, --nfkc - write NFKC (canonical composition computed from NFKD).
-s, --stream-safe - write stream-safe bytes (Combining Grapheme Joiners, UAX15-D4).
-V, --version - output version information and exit

utf8-norm was created at Rust Belt Rust 2019 in Dayton, OH. Thanks to @j41manning for her
excellent talk regarding Unicode handling in Rust.

Natively install as `cargo install utf8-norm` or from your distribution's package manager.


+ 66
- 0
src/main.rs View File

@@ -0,0 +1,66 @@
extern crate lapp;
extern crate unicode_normalization;
use std::io::{BufRead, BufReader, BufWriter, Write};
use unicode_normalization::UnicodeNormalization;

/// Counts how many entries of `bools` are `true`.
///
/// The result is a `usize` because a slice can never contain more than
/// `usize::MAX` elements, so the count always fits.
fn trues(bools: &[bool]) -> usize {
    bools.iter().filter(|&&flag| flag).count()
}

/// Program entry point.
///
/// Parses the command line using the README as the lapp usage specification,
/// then copies the input to the output one line at a time, applying the
/// selected Unicode normalization form (NFC when no form flag is given) and,
/// with `--stream-safe`, the stream-safe transformation on top of it.
///
/// `BufRead::lines` strips the line terminator (`\n` or `\r\n`) from each
/// line, so every output line is re-terminated with LF, or with CRLF when
/// `--crlf` is given. This also means the last line always ends with a
/// terminator, even if the input did not.
///
/// Exits with status 0 after printing the version for `--version`, and with
/// status 1 if more than one of `--nfd`, `--nfkd`, `--nfc`, `--nfkc` is given.
///
/// # Panics
///
/// Panics if a line cannot be read (which includes input that is not valid
/// UTF-8) or if the output cannot be written or flushed.
fn main() {
    // The README doubles as the usage text that lapp parses for flag definitions.
    let usage = include_str!("../README");
    let args = lapp::parse_args(usage);

    // Handle --version before requesting any files, so it works regardless of
    // the state of the input/output arguments.
    if args.get_bool("version") {
        println!("Version {} licensed GPLv3. (C) 2019 Leonora Tindall <nora@nora.codes>",
                 env!("CARGO_PKG_VERSION"));
        return;
    }

    let nfd = args.get_bool("nfd");
    let nfkd = args.get_bool("nfkd");
    let nfc = args.get_bool("nfc");
    let nfkc = args.get_bool("nfkc");
    let ss = args.get_bool("stream-safe");

    // Validate the flags before asking lapp for the files, so that an invalid
    // invocation exits without opening the input or (presumably creating or
    // truncating) the output -- NOTE(review): confirm against lapp's
    // `get_outfile` behavior.
    if trues(&[nfd, nfkd, nfc, nfkc]) > 1 {
        eprintln!("--nfd, --nfkd, --nfc, and --nfkc are mutually exclusive.");
        std::process::exit(1);
    }

    // The line ending never changes during the run; look the flag up once.
    let line_ending = if args.get_bool("crlf") { "\x0D\x0A" } else { "\x0A" };

    let infile = BufReader::new(args.get_infile("infile"));
    let mut outfile = BufWriter::new(args.get_outfile("outfile"));

    for line in infile.lines() {
        // `lines()` already yields an owned String, so no copy is needed.
        let mut line = line.expect("Could not read line from file. Error");
        line.push_str(line_ending);

        // Each normalization adaptor has a different concrete type; box them
        // so the optional stream-safe pass below can treat them uniformly.
        let normalized: Box<dyn Iterator<Item = char>> = if nfd {
            Box::new(line.chars().nfd())
        } else if nfkc {
            Box::new(line.chars().nfkc())
        } else if nfkd {
            Box::new(line.chars().nfkd())
        } else {
            // NFC is the default, whether or not --nfc was given explicitly.
            Box::new(line.chars().nfc())
        };

        let output: String = if ss {
            normalized.stream_safe().collect()
        } else {
            normalized.collect()
        };

        outfile
            .write_all(output.as_bytes())
            .expect("Could not write to output. Error");
    }

    // Dropping a BufWriter flushes it but discards any error; flush explicitly
    // so a failure to write the final buffered bytes is reported.
    outfile.flush().expect("Could not write to output. Error");
}

Loading…
Cancel
Save