#! /usr/bin/env dub
/++ dub.sdl:
// Default build: a plain executable (its main() re-runs dub in test mode).
configuration "release" {
    targetType "executable"
}
// Test build: compiled as a library and linked against the "silly" package.
configuration "unittest" {
    targetType "library"
    dependency "silly" version="~>1.1.1"
}
+/
/+ unicode note:
"noël" is "noe\u0308l"
it should have two dots over the 'e'
this is how it displays in urxvt, gnome-terminal, Firefox, and Brave
if it looks like "noeI", try another terminal
to confirm, try editing and running the examples at
+/
// No main() is declared for unittest builds (presumably the test runner from
// the "silly" dependency provides the entry point -- TODO confirm).
version (unittest) {
} else {
    // Running the script normally re-invokes dub on this very file in test
    // mode, forwarding `--threads=1` to the test binary.
    //
    // Returns: the exit status of the spawned `dub test` process, so that a
    // failing test run also makes the script itself exit non-zero.
    int main() {
        import std.process : spawnProcess, wait;

        auto dubTest = spawnProcess(["dub", "test", "--single", __FILE__, "--", "--threads=1"]);
        return dubTest.wait;
    }
}
// The "length" of a piece of text depends on the unit being counted:
// UTF-8 code units (bytes), code points, or graphemes.
@("UTF-8 length in bytes, codepoints, graphemes")
@safe unittest {
    import std.uni : byCodePoint, byGrapheme;
    import std.algorithm : count;

    // The diaeresis is written as an explicit U+0308 escape so the test data
    // cannot be silently changed by an editor or tool that normalises the
    // source text to the precomposed form.
    enum s1 = "noe\u0308l"; // n, o, e, combining diaeresis, l
    enum s2 = "e\u0308";    // e, combining diaeresis

    // .length counts UTF-8 code units; U+0308 takes two of them
    assert(s1.length == 6);
    assert(s2.length == 3);

    // code points: the base letter and the combining mark count separately
    assert(s1.byCodePoint.count == 5);
    assert(s2.byCodePoint.count == 2);

    // graphemes: the base letter plus its mark form a single unit
    assert(s1.byGrapheme.count == 4);
    assert(s2.byGrapheme.count == 1);
}
// Indexing the same text by UTF-8 code unit, by code point and by grapheme.
@("UTF-8 indexing by byte, codepoint, grapheme")
@safe unittest {
    import std.uni : byCodePoint, byGrapheme, Grapheme;
    import std.range : drop, indexed;
    import std.array : array;
    import std.algorithm : map, joiner;
    import std.encoding : index, decode;

    // The diaeresis is written as an explicit U+0308 escape so the test data
    // survives any normalisation of the source text.
    // NOTE: s1 and s2 must stay `enum` literals: several comparisons below
    // put them next to dchar[] values, which only matches when the literal
    // itself is typed as UTF-32.
    enum s1 = "noe\u0308l";
    enum s2 = "e\u0308";

    // --- by byte (UTF-8 code unit) ---
    assert(s1[0..3] ~ s1[$-1] == "noel"); // skips the two bytes of U+0308
    assert(s1[2..5] == s2);               // 'e' plus the two bytes of U+0308

    // --- by code point ---
    const cbd = s1.byCodePoint.drop(3).front; // combining diaeresis
    enum s3 = "noe\u0308l"d;
    assert(s3[3] == cbd); // dchars = codepoints
    assert(s1 == ['n', 'o', 'e', cbd, 'l']);
    // moving the mark one code point earlier attaches it to the 'o'
    assert("no\u0308el" == ['n', 'o', cbd, 'e', 'l']);
    assert("no\u0308el" == s1.byCodePoint.array.indexed([0, 1, 3, 2, 4]).array);
    // lambda to give decode a ref
    assert((s => s.decode)(s1[s1.index(3) .. $]) == cbd);

    // --- by grapheme ---
    auto g1 = Grapheme("e\u0308");
    assert(s1.byGrapheme.drop(2).front == g1);
    assert(g1[].array == s2);
    // swapping whole graphemes keeps the mark on its 'e'
    assert("ne\u0308ol" == s1.byGrapheme.array.indexed([0, 2, 1, 3]).map!(g => g[].array).joiner.array);
}
// Two ways of checking that a string is well-formed UTF-8:
// std.utf.validate (throws) and std.encoding.isValid (returns a bool).
@("validating UTF")
@safe unittest {
    import std.utf : validate, UTFException;
    import std.encoding : isValid;
    import std.exception : assertThrown, assertNotThrown;

    // well-formed; the diaeresis is an explicit U+0308 escape so the data
    // survives any normalisation of the source text
    enum s1 = "noe\u0308l";
    // malformed: bytes 0xA7 0x85 0xAF (octal escapes) are UTF-8 continuation
    // bytes with no lead byte in front of them
    enum s2 = "hello\247\205\257there";

    // validate reports malformed input by throwing
    assertNotThrown!UTFException(s1.validate);
    assertThrown!UTFException(s2.validate);

    // isValid reports it through its return value
    assert(s1.isValid);
    assert(!s2.isValid);
}
// Ways of getting usable text out of a string that contains malformed UTF-8.
@("sanitizing UTF")
unittest {
    import std.encoding : sanitize, validLength;
    import std.utf : byDchar, replacementDchar;
    import std.algorithm : equal, filter;

    // bytes 0xA7 0x85 0xAF (octal escapes) are stray UTF-8 continuation bytes
    string s1 = "hello\247\205\257there";
    // "hello", three U+FFFD replacement characters (65533), "there"
    dchar[] replaced = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    // byDchar decodes lazily and substitutes U+FFFD for each bad byte
    assert(s1.byDchar.equal(replaced));
    // ... so dropping the replacement characters leaves only the good text
    assert("hellothere".equal(s1.byDchar.filter!(c => c != replacementDchar)));
    // validLength gives the length of the well-formed prefix
    assert("hello" == s1[0 .. s1.validLength]);
    // sanitize returns a string that is well-formed from start to end
    string cleaned = s1.sanitize;
    assert(cleaned.validLength == cleaned.length);
}
// Turning raw bytes into a string, and the different points at which the
// UTF-8 well-formedness of those bytes can be checked.
@("bytes -> chars while checking UTF")
unittest {
    import std.string : representation, assumeUTF;
    import std.utf : validate, UTFException;
    import std.encoding : isValid;
    import std.exception : enforce, assertThrown;
    import core.exception : AssertError;

    // well-formed bytes; the diaeresis is an explicit U+0308 escape so the
    // data survives any normalisation of the source text
    immutable(ubyte)[] b1 = "noe\u0308l".representation;
    // malformed bytes: 0xA7 0x85 0xAF are stray UTF-8 continuation bytes
    immutable(ubyte)[] b2 = "hello\247\205\257there".representation;

    // --- well-formed input: every route succeeds ---
    string s1 = b1.assumeUTF; // only checks in debug builds!
    // cast to string (not char[]) so immutability is not cast away
    auto s2 = cast(string) b1; // unchecked
    assert(s1 == s2);
    s2.validate; // throws UTFException if invalid
    assert(s2.isValid); // only checks if asserts run
    enforce!UTFException(s2.isValid, "invalid UTF");

    // --- malformed input: the cast is silent, the checks are not ---
    auto s3 = cast(string) b2;
    assertThrown!UTFException(s3.validate);
    assert(!s3.isValid);
    assertThrown!UTFException(enforce!UTFException(s3.isValid, "invalid UTF"));
    // assumeUTF only verifies its input in debug builds, so only expect the
    // failed assertion when this file is itself compiled in debug mode
    debug assertThrown!AssertError(b2.assumeUTF);
}
// Detecting, stripping and prepending byte-order marks with std.encoding.
@("byte-order-mark (BOM) ops")
unittest {
    import std.encoding : getBOM, BOM, bomTable;
    import std.string : representation;

    // raw byte sequences; the first three start with a BOM, the last does not
    enum s1 = "\x84\x31\x95\x33<- GB-18030".representation;
    enum s2 = "\xEF\xBB\xBF<- UTF-8".representation;
    enum s3 = "\xFF\xFE<- UTF-16 (LE)".representation;
    enum s4 = "no BOM here".representation;

    // the detected BOM's byte sequence tells how many leading bytes to drop
    assert(s1[s1.getBOM.sequence.length .. $] == "<- GB-18030"); // stripping
    // bomTable maps a BOM schema to the bytes to prepend
    assert(s2 == bomTable[BOM.utf8].sequence ~ "<- UTF-8".representation); // adding
    assert(BOM.utf16le == s3.getBOM.schema); // getting
    assert(s4.getBOM.schema == BOM.none); // checking
}