Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: add basics/unicode_ops.d
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7187a3dbcd18426455ac38e02b89879c9a14ce66d86f55ceeebc4dd1aaaaa98d
User & Date: admin 2021-10-03 08:10:11
Context
2021-10-03
08:14
add BOM.none check-in: 8d26af00d9 user: admin tags: trunk
08:10
add basics/unicode_ops.d check-in: 7187a3dbcd user: admin tags: trunk
07:30
delete trailing whitespace check-in: cb052e5763 user: admin tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Added basics/unicode_ops.d.





























































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#! /usr/bin/env dub
/++ dub.sdl:
    configuration "release" {
        targetType "executable"
    }
    configuration "unittest" {
        targetType "library"
        dependency "silly" version="~>1.1.1"
    }
+/
/+ unicode note:
    "noël" is "noe\u0308l"
    it should have two dots over the 'e'
    this is how it displays in urxvt, gnome-terminal, Firefox, and Brave
    if it looks like "noeI", try another terminal

    to confirm, try editing and running the examples at
    https://dlang.org/phobos/std_uni.html#.byCodePoint
+/
version (unittest) {
    // No main is declared in unittest builds.
} else {
    /++
        Script entry point: re-runs this very file through `dub test --single`
        and waits for it to finish.

        Returns: the exit status of the spawned `dub` process, so that a
        failing test run makes the script itself exit non-zero.
    +/
    int main() {
        import std.process : spawnProcess, wait;

        // Everything after "--" is forwarded by dub to the test executable.
        auto dubTest = spawnProcess(["dub", "test", "--single", __FILE__, "--", "--threads=1"]);
        return dubTest.wait;
    }
}

@("UTF-8 length in bytes, codepoints, graphemes")
@safe unittest {
    import std.uni : byCodePoint, byGrapheme;
    import std.algorithm : count;

    // The combining diaeresis is written as an explicit \u0308 escape: the
    // decomposed and precomposed spellings look identical in an editor, and
    // the counts below only hold for the decomposed one.
    enum s1 = "noe\u0308l";
    enum s2 = "e\u0308";

    // .length counts UTF-8 code units (bytes); U+0308 takes two of them
    assert(s1.length == 6);
    assert(s2.length == 3);

    // code points: 'e' and the combining mark are counted separately
    assert(s1.byCodePoint.count == 5);
    assert(s2.byCodePoint.count == 2);

    // graphemes: 'e' plus its combining mark form a single cluster
    assert(s1.byGrapheme.count == 4);
    assert(s2.byGrapheme.count == 1);
}

@("UTF-8 indexing by byte, codepoint, grapheme")
@safe unittest {
    import std.uni : byCodePoint, byGrapheme, Grapheme;
    import std.range : drop, indexed;
    import std.array : array;
    import std.algorithm : map, joiner;
    import std.encoding : index, decode;

    // All literals spell the combining diaeresis as \u0308 so the test does
    // not depend on the source file staying in decomposed form.
    enum s1 = "noe\u0308l";
    enum s2 = "e\u0308";

    // byte (code unit) indexing: the mark occupies bytes 3 and 4
    assert(s1[0..3] ~ s1[$-1] == "noel");
    assert(s1[2..5] == s2);

    // code point indexing
    const cbd = s1.byCodePoint.drop(3).front; // combining diaeresis
    assert(cbd == '\u0308');
    enum s3 = "noe\u0308l"d;
    assert(s3[3] == cbd); // dchars = codepoints
    assert(s1 == ['n', 'o', 'e', cbd, 'l']);
    assert("no\u0308el" == ['n', 'o', cbd, 'e', 'l']);
    // swapping code points 2 and 3 moves the mark onto the 'o'
    assert("no\u0308el" == s1.byCodePoint.array.indexed([0, 1, 3, 2, 4]).array);

    // lambda to give decode a ref
    assert((s => s.decode)(s1[s1.index(3) .. $]) == cbd);

    // grapheme indexing: 'e' and its mark travel together
    auto g1 = Grapheme("e\u0308");
    assert(s1.byGrapheme.drop(2).front == g1);
    assert(g1[].array == s2);
    assert("ne\u0308ol" == s1.byGrapheme.array.indexed([0, 2, 1, 3]).map!(g => g[].array).joiner.array);
}

@("validating UTF")
@safe unittest {
    import std.encoding : isValid;
    import std.exception : assertNotThrown, assertThrown;
    import std.utf : UTFException, validate;

    enum wellFormed = "noe\u0308l";
    // the octal escapes insert raw bytes that are not valid UTF-8
    enum malformed = "hello\247\205\257there";

    // std.encoding answers with a bool
    assert(wellFormed.isValid);
    assert(!malformed.isValid);

    // std.utf answers by throwing (or not throwing) UTFException
    assertNotThrown!UTFException(wellFormed.validate);
    assertThrown!UTFException(malformed.validate);
}

@("sanitizing UTF")
unittest {
    import std.algorithm : equal, filter;
    import std.encoding : sanitize, validLength;
    import std.utf : byDchar, replacementDchar;

    string dirty = "hello\247\205\257there";
    // expected decoding: each of the three bad bytes shows up as U+FFFD
    dstring expected = "hello\uFFFD\uFFFD\uFFFDthere";

    // drop the replacement characters while decoding
    auto kept = dirty.byDchar.filter!(ch => ch != replacementDchar);
    assert(equal("hellothere", kept));

    // keep only the leading run of valid bytes
    const goodBytes = dirty.validLength;
    assert(dirty[0 .. goodBytes] == "hello");

    // replace bad bytes lazily (byDchar) or eagerly (sanitize)
    assert(equal(dirty.byDchar, expected));
    assert(equal(dirty.sanitize, expected));
}

@("bytes -> chars while checking UTF")
unittest {
    import std.string : representation, assumeUTF;
    import std.utf : validate, UTFException;
    import std.encoding : isValid;
    import std.exception : enforce, assertThrown;
    import core.exception : AssertError;

    // explicit \u0308 escape: same bytes regardless of how the file is normalized
    enum valid = "noe\u0308l";
    immutable(ubyte)[] b1 = valid.representation;
    immutable(ubyte)[] b2 = "hello\247\205\257there".representation;

    string s1 = b1.assumeUTF;   // only checks in debug builds!
    assert(s1 == valid);

    // cast to string (not char[]) so the immutability of the bytes is kept
    auto s2 = cast(string) b1;  // unchecked
    s2.validate;                // throws UTFException if invalid
    assert(s2.isValid);         // only checks if asserts run
    enforce!UTFException(s2.isValid, "invalid UTF");

    // assumeUTF only validates in debug builds, so only expect the
    // AssertError there; without -debug the call returns normally
    debug assertThrown!AssertError(b2.assumeUTF);
    auto s3 = cast(string) b2;
    assertThrown!UTFException(s3.validate);
    assert(!s3.isValid);
    assertThrown!UTFException(enforce!UTFException(s3.isValid));
}

@("byte-order-mark (BOM) ops")
unittest {
    import std.encoding : BOM, bomTable, getBOM;
    import std.string : representation;

    // byte strings that each begin with a different byte-order mark
    enum gb18030 = "\x84\x31\x95\x33<- GB-18030".representation;
    enum utf8 = "\xEF\xBB\xBF<- UTF-8".representation;
    enum utf16le = "\xFF\xFE<- UTF-16 (LE)".representation;

    // stripping: skip as many bytes as the detected BOM is long
    const bomLength = gb18030.getBOM.sequence.length;
    assert(gb18030[bomLength .. $] == "<- GB-18030");

    // adding: prepend the table's byte sequence for the wanted encoding
    assert(bomTable[BOM.utf8].sequence ~ "<- UTF-8".representation == utf8);

    // checking: identify the encoding from the leading bytes
    assert(utf16le.getBOM.schema == BOM.utf16le);
}