Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | add basics/unicode_ops.d |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
7187a3dbcd18426455ac38e02b89879c |
User & Date: | admin 2021-10-03 08:10:11 |
Context
2021-10-03
| ||
08:14 | add BOM.none check-in: 8d26af00d9 user: admin tags: trunk | |
08:10 | add basics/unicode_ops.d check-in: 7187a3dbcd user: admin tags: trunk | |
07:30 | delete trailing whitespace check-in: cb052e5763 user: admin tags: trunk | |
Changes
Added basics/unicode_ops.d.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#! /usr/bin/env dub
/++ dub.sdl:
    configuration "release" {
        targetType "executable"
    }
    configuration "unittest" {
        targetType "library"
        dependency "silly" version="~>1.1.1"
    }
+/

/+ unicode note:
   The sample word used throughout is "noel" with a diaeresis on the 'e',
   stored in DECOMPOSED form: 'e' followed by U+0308 COMBINING DIAERESIS,
   i.e. "noe\u0308l".  The literals below spell the combining mark with an
   explicit \u0308 escape so that the byte/code-point counts asserted here
   cannot be changed by an editor or tool that normalises the source to NFC.

   When printed it should have two dots over the 'e';
   this is how it displays in urxvt, gnome-terminal, Firefox, and Brave.
   If it looks like "noeI", try another terminal.

   To confirm, try editing and running the examples at
   https://dlang.org/phobos/std_uni.html#.byCodePoint
+/

version (unittest)
{
    // Under `dub test` only the unittest blocks below are compiled in.
}
else
{
    /// Running the file directly re-invokes dub to execute this file's unit
    /// tests and returns dub's exit status, so a failing run is visible to
    /// the calling shell or CI job.
    int main()
    {
        import std.process : spawnProcess, wait;

        // `wait` yields the child's exit code; hand it straight back.
        return spawnProcess(["dub", "test", "--single", __FILE__, "--", "--threads=1"]).wait;
    }
}

/// The same text has three different "lengths" depending on the unit counted.
@("UTF-8 length in bytes, codepoints, graphemes")
@safe unittest
{
    import std.algorithm : count;
    import std.uni : byCodePoint, byGrapheme;

    enum s1 = "noe\u0308l"; // n, o, e, U+0308, l
    enum s2 = "e\u0308";    // e, U+0308

    // .length counts UTF-8 code units (bytes); U+0308 takes two of them
    assert(s1.length == 6);
    assert(s2.length == 3);

    // code points: the combining mark is a code point of its own
    assert(s1.byCodePoint.count == 5);
    assert(s2.byCodePoint.count == 2);

    // graphemes: base letter + combining mark form a single grapheme
    assert(s1.byGrapheme.count == 4);
    assert(s2.byGrapheme.count == 1);
}

/// Slicing and indexing the same text by byte, by code point and by grapheme.
@("UTF-8 indexing by byte, codepoint, grapheme")
@safe unittest
{
    import std.algorithm : joiner, map;
    import std.array : array;
    import std.encoding : decode, index;
    import std.range : drop, indexed;
    import std.uni : byCodePoint, byGrapheme, Grapheme;

    enum s1 = "noe\u0308l";
    enum s2 = "e\u0308";

    // --- by byte: plain slices operate on UTF-8 code units
    assert(s1[0 .. 3] ~ s1[$ - 1] == "noel");
    assert(s1[2 .. 5] == s2);

    // --- by code point
    const cbd = s1.byCodePoint.drop(3).front; // combining diaeresis
    enum s3 = "noe\u0308l"d;
    assert(s3[3] == cbd); // dchars = codepoints
    // s1 is a manifest constant, so the literal is compared as a dstring here
    assert(s1 == ['n', 'o', 'e', cbd, 'l']);
    // moving the combining mark moves the dots onto the 'o'
    assert("no\u0308el" == ['n', 'o', cbd, 'e', 'l']);
    assert("no\u0308el" == s1.byCodePoint.array.indexed([0, 1, 3, 2, 4]).array);
    // lambda to give decode a ref
    assert((s => s.decode)(s1[s1.index(3) .. $]) == cbd);

    // --- by grapheme: the letter and its mark travel together
    auto g1 = Grapheme("e\u0308");
    assert(s1.byGrapheme.drop(2).front == g1);
    assert(g1[].array == s2);
    assert("ne\u0308ol" == s1.byGrapheme.array.indexed([0, 2, 1, 3]).map!(g => g[].array).joiner.array);
}

/// Two ways of checking validity: throwing (std.utf) and boolean (std.encoding).
@("validating UTF")
@safe unittest
{
    import std.encoding : isValid;
    import std.exception : assertNotThrown, assertThrown;
    import std.utf : UTFException, validate;

    enum s1 = "noe\u0308l";
    enum s2 = "hello\247\205\257there"; // octal escapes: three bytes that are not valid UTF-8 here

    assertNotThrown!UTFException(s1.validate);
    assertThrown!UTFException(s2.validate);

    assert(s1.isValid);
    assert(!s2.isValid);
}

/// Dropping, truncating at, or replacing invalid sequences.
@("sanitizing UTF")
unittest
{
    import std.algorithm : equal, filter;
    import std.encoding : sanitize, validLength;
    import std.utf : byDchar, replacementDchar;

    string s1 = "hello\247\205\257there";
    // "hello", three replacement characters (65533 = U+FFFD), "there"
    dchar[] replaced = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    // drop the invalid parts
    assert("hellothere".equal(s1.byDchar.filter!(c => c != replacementDchar)));
    // keep only the valid prefix
    assert("hello" == s1[0 .. s1.validLength]);
    // replace each invalid byte
    assert(s1.byDchar.equal(replaced));
    assert(s1.sanitize.equal(replaced));
}

/// Turning raw bytes into text, with the various levels of checking available.
@("bytes -> chars while checking UTF")
unittest
{
    import core.exception : AssertError;
    import std.encoding : isValid;
    import std.exception : assertThrown, enforce;
    import std.string : assumeUTF, representation;
    import std.utf : UTFException, validate;

    immutable(ubyte)[] b1 = "noe\u0308l".representation;
    immutable(ubyte)[] b2 = "hello\247\205\257there".representation;

    // valid input
    string s1 = b1.assumeUTF;   // only checks in debug builds!
    auto s2 = cast(string) b1;  // unchecked; keeps the data immutable
    s2.validate;                // throws UTFException if invalid
    assert(s2.isValid);         // only checks if asserts run
    enforce!UTFException(s2.isValid, "invalid UTF");

    // invalid input
    assertThrown!AssertError(b2.assumeUTF);
    auto s3 = cast(string) b2;
    assertThrown!UTFException(s3.validate);
    assert(!s3.isValid);
    assertThrown!UTFException(enforce!UTFException(s3.isValid));
}

/// Detecting, stripping and prepending byte-order marks.
@("byte-order-mark (BOM) ops")
unittest
{
    import std.encoding : BOM, bomTable, getBOM;
    import std.string : representation;

    enum s1 = "\x84\x31\x95\x33<- GB-18030".representation;
    enum s2 = "\xEF\xBB\xBF<- UTF-8".representation;
    enum s3 = "\xFF\xFE<- UTF-16 (LE)".representation;

    assert(s1[s1.getBOM.sequence.length .. $] == "<- GB-18030"); // stripping
    assert(s2 == bomTable[BOM.utf8].sequence ~ "<- UTF-8".representation); // adding
    assert(BOM.utf16le == s3.getBOM.schema); // checking
}