d notes: Artifact [dadacedbe4]

Artifact dadacedbe4e3f92fb3bfe3435c98a961d6e585eb42361d5752049547b3123fc8:

Executable file basics/unicode_ops.d — part of check-in [8d26af00d9] at 2021-10-03 08:14:23 on branch trunk — add BOM.none (user: admin size: 4579)
#! /usr/bin/env dub
/++ dub.sdl:
    configuration "release" {
        targetType "executable"
    }
    configuration "unittest" {
        targetType "library"
        dependency "silly" version="~>1.1.1"
    }
+/
/+ unicode note:
    "noël" is "noe\u0308l"
    it should have two dots over the 'e'
    this is how it displays in urxvt, gnome-terminal, Firefox, and Brave
    if it looks like "noeI", try another terminal

    to confirm, try editing and running the examples at
    https://dlang.org/phobos/std_uni.html#.byCodePoint
+/
version (unittest) {
    // unittest builds define no main() in this file
} else {
    /++
        Script entry point: re-invokes this very file through `dub test` so that
        running the script runs its unittests.

        Returns: the exit status reported by `wait` for the spawned `dub test`
        process, so a failing test run makes the script exit non-zero instead
        of always reporting success.
    +/
    int main() {
        import std.process : spawnProcess, wait;
        // everything after "--" is handed on to the test executable
        return spawnProcess(["dub", "test", "--single", __FILE__, "--", "--threads=1"]).wait;
    }
}

@("UTF-8 length in bytes, codepoints, graphemes")
@safe unittest {
    import std.algorithm : count;
    import std.uni : byCodePoint, byGrapheme;

    // Both literals spell the accented letter as 'e' followed by U+0308.
    enum word = "noël";
    enum letter = "ë";

    // whole word: 6 UTF-8 code units, 5 codepoints, 4 graphemes
    assert(word.length == 6);
    assert(count(word.byCodePoint) == 5);
    assert(count(word.byGrapheme) == 4);

    // accented letter alone: 3 UTF-8 code units, 2 codepoints, 1 grapheme
    assert(letter.length == 3);
    assert(count(letter.byCodePoint) == 2);
    assert(count(letter.byGrapheme) == 1);
}

// Demonstrates three ways of addressing parts of a UTF-8 string: by raw byte
// offset (slicing), by codepoint (byCodePoint / dstring / std.encoding.index),
// and by grapheme (byGrapheme / Grapheme).
@("UTF-8 indexing by byte, codepoint, grapheme")
@safe unittest {
    import std.uni : byCodePoint, byGrapheme, Grapheme;
    import std.range : drop, indexed;
    import std.array : array;
    import std.algorithm : map, joiner;
    import std.encoding : index, decode;

    // s1 is the full word, s2 is just its accented letter (base + combining mark)
    enum s1 = "noël";
    enum s2 = "ë";

    // --- by byte: slices on a string are byte offsets ---
    // bytes 0..3 plus the final byte give the word without the combining mark
    assert(s1[0..3] ~ s1[$-1] == "noel");
    // bytes 2..5 are the base letter and its combining mark, i.e. s2
    assert(s1[2..5] == s2);

    // --- by codepoint ---
    const cbd = s1.byCodePoint.drop(3).front; // combining diaeresis
    enum s3 = "noël"d;
    assert(s3[3] == cbd); // dchars = codepoints
    assert(s1 == ['n', 'o', 'e', cbd, 'l']);
    // moving the combining mark one codepoint earlier attaches it to the 'o'
    assert("nöel" == ['n', 'o', cbd, 'e', 'l']);
    // same reordering expressed by swapping codepoints 2 and 3 of s1
    assert("nöel" == s1.byCodePoint.array.indexed([0, 1, 3, 2, 4]).array);

    // std.encoding.index(3) supplies the slice start used here; the first
    // codepoint decoded from that slice is the combining mark
    // lambda to give decode a ref
    assert((s => s.decode)(s1[s1.index(3) .. $]) == cbd);

    // --- by grapheme: base letter + combining mark form one unit ---
    auto g1 = Grapheme("ë");
    assert(s1.byGrapheme.drop(2).front == g1);
    // slicing a Grapheme with [] yields its codepoints
    assert(g1[].array == s2);
    // swapping graphemes 1 and 2 moves the accented letter as a whole
    assert("nëol" == s1.byGrapheme.array.indexed([0, 2, 1, 3]).map!(g => g[].array).joiner.array);
}

@("validating UTF")
@safe unittest {
    import std.encoding : isValid;
    import std.exception : assertNotThrown, assertThrown;
    import std.utf : UTFException, validate;

    enum wellFormed = "noël";
    enum malformed = "hello\247\205\257there"; // raw octal-escaped bytes between the words

    // std.utf.validate signals bad input by throwing UTFException
    assertNotThrown!UTFException(validate(wellFormed));
    assertThrown!UTFException(validate(malformed));

    // std.encoding.isValid answers the same question with a bool
    assert(isValid(wellFormed));
    assert(!isValid(malformed));
}

@("sanitizing UTF")
unittest {
    import std.algorithm : equal, filter;
    import std.encoding : sanitize, validLength;
    import std.utf : byDchar, replacementDchar;

    string dirty = "hello\247\205\257there";
    // codepoints of "hello", then three 65533 (U+FFFD) entries, then "there"
    dchar[] expected = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    // option 1: decode with byDchar and discard the replacement characters
    auto kept = dirty.byDchar.filter!(c => c != replacementDchar);
    assert(equal("hellothere", kept));

    // option 2: keep only the leading part that validLength accepts
    const prefixLen = dirty.validLength;
    assert("hello" == dirty[0 .. prefixLen]);

    // option 3: keep the replacement characters, via byDchar or sanitize
    assert(equal(dirty.byDchar, expected));
    assert(equal(dirty.sanitize, expected));
}

// Shows the ways of reinterpreting raw bytes as UTF-8 text and which of them
// actually verify the data: assumeUTF, a plain cast, validate, isValid, enforce.
@("bytes -> chars while checking UTF")
unittest {
    import std.string : representation, assumeUTF;
    import std.utf : validate, UTFException;
    import std.encoding : isValid;
    import std.exception : enforce, assertThrown;
    import core.exception : AssertError;

    // b1 holds well-formed UTF-8, b2 has raw octal-escaped bytes in the middle
    immutable(ubyte)[] b1 = "noël".representation;
    immutable(ubyte)[] b2 = "hello\247\205\257there".representation;

    string s1 = b1.assumeUTF;   // only checks in debug builds!

    // Cast to string (immutable(char)[]) rather than char[]: the source bytes
    // are immutable, and a char[] cast would hand out a mutable view of them.
    auto s2 = cast(string) b1;  // unchecked
    s2.validate;                // throws UTFException if invalid
    assert(s2.isValid);         // only checks if asserts run
    enforce!UTFException(s2.isValid, "invalid UTF");

    // the same four checks, each rejecting the malformed bytes
    assertThrown!AssertError(b2.assumeUTF);
    auto s3 = cast(string) b2;
    assertThrown!UTFException(s3.validate);
    assert(!s3.isValid);
    assertThrown!UTFException(enforce!UTFException(s3.isValid));
}

@("byte-order-mark (BOM) ops")
unittest {
    import std.encoding : BOM, bomTable, getBOM;
    import std.string : representation;

    // byte strings whose leading bytes are (or are not) a byte-order mark
    enum gbBytes = "\x84\x31\x95\x33<- GB-18030".representation;
    enum utf8Bytes = "\xEF\xBB\xBF<- UTF-8".representation;
    enum utf16Bytes = "\xFF\xFE<- UTF-16 (LE)".representation;
    enum bareBytes = "no BOM here".representation;

    // stripping: skip as many bytes as the detected BOM sequence is long
    const bomLen = gbBytes.getBOM.sequence.length;
    assert(gbBytes[bomLen .. $] == "<- GB-18030");

    // adding: prepend the sequence that bomTable lists for the schema
    assert(utf8Bytes == bomTable[BOM.utf8].sequence ~ "<- UTF-8".representation);

    // getting: read the schema of the detected BOM
    const detected = utf16Bytes.getBOM.schema;
    assert(BOM.utf16le == detected);

    // checking: input without a BOM reports BOM.none
    assert(bareBytes.getBOM.schema == BOM.none);
}