File basics/string_type_conversions.d from the latest check-in


#! /usr/bin/env dub
/++ dub.sdl:
    configuration "release" {
        targetType "executable"
    }
    configuration "unittest" {
        targetType "library"
        dependency "silly" version="~>1.1.1"
    }
+/
version (unittest) {
    // Test builds: the unittest blocks below are the payload, so this branch
    // intentionally declares nothing.
} else {
    /// Script entry point: re-invokes dub on this very file in test mode.
    ///
    /// Returns: the exit status of the spawned `dub test` process, so that a
    /// failing test run makes the script itself exit non-zero instead of
    /// silently reporting success.
    int main() {
        import std.process : spawnProcess, wait;

        // Everything after "--" is handed by dub to the test executable.
        return spawnProcess(["dub", "test", "--single", __FILE__, "--", "--threads=1"]).wait;
    }
}

/// char[] -> ubyte[] :: std.string.representation
/// - infallible
/// - @safe
/// - non-allocating (the result aliases the source memory, as asserted below)
/// - preserves qualifiers; works for wstring/dstring too (dstring shown below)
@("std.string.representation")
nothrow @safe @nogc unittest {
    import std.string : representation;

    // narrow string viewed as immutable(ubyte)[]
    string narrow = "hello world";
    immutable(ubyte)[] narrowUnits = narrow.representation;
    // pointers of different element types are compared through size_t
    assert(cast(size_t) narrow.ptr == cast(size_t) narrowUnits.ptr);

    // UTF-32 string viewed as immutable(uint)[]
    dstring wide = "hello world"d;
    immutable(uint)[] wideUnits = wide.representation;
    assert(cast(size_t) wide.ptr == cast(size_t) wideUnits.ptr);
}

/// ubyte[] -> char[] :: std.string.assumeUTF
/// - throws AssertError in debug builds if the input is not valid UTF
/// - @safe
/// - non-allocating (the result aliases the source memory, as asserted below)
/// - preserves qualifiers; also applies to the wstring/dstring counterparts
@("std.string.assumeUTF")
nothrow @safe @nogc unittest {
    import std.string : assumeUTF;

    immutable(ubyte)[2] raw = [104, 105]; // the UTF-8 bytes of "hi"
    string text = raw.assumeUTF;

    assert("hi" == text);
    // same memory, merely re-typed
    assert(cast(size_t) text.ptr == cast(size_t) raw.ptr);
}

/// ubyte[] -> char[] :: cast(char[])
/// - can lead to decode errors later if the input isn't valid UTF
/// - @safe as long as no qualifiers are discarded
/// - non-allocating
/// To check the input first, consider std.encoding.isValid (returns a bool)
/// or std.utf.validate (throws UTFException).
@("cast(char[]) bytearray")
nothrow @safe @nogc unittest {
    import std.string : representation;

    immutable(ubyte)[] bytes = "hello world".representation;
    // `string` is an alias for immutable(char)[], so no qualifier is dropped
    string viaCast = cast(string) bytes;
    assert(cast(size_t) viaCast.ptr == cast(size_t) bytes.ptr);
}

/// string mutability :: .dup, .idup, std.exception.assumeUnique, std.array.array
/// - .dup / .idup copy into a new mutable / immutable array
/// - assumeUnique re-types a mutable array as immutable without copying
/// - .array materialises a range into a new mutable array
@(".dup .idup .assumeUnique .array")
@safe nothrow unittest {
    import std.algorithm : map;
    import std.array : array;
    import std.exception : assumeUnique, assumeWontThrow;

    char[] mutableCopy = "hello world".dup; // literal -> fresh mutable copy
    string frozenCopy = mutableCopy.idup;   // mutable -> fresh immutable copy

    // Not @safe, but neither allocates nor copies. Note that assumeUnique
    // also resets the slice it is given to null.
    (() @trusted @nogc {
        string reTyped = mutableCopy.assumeUnique;
    })();

    // .map is here only to illustrate: string -> range of char -> char[].
    // Iterating the string can throw if it isn't valid UTF, hence the
    // assumeWontThrow wrapper; .array copies the range into a new mutable array.
    char[] shifted = assumeWontThrow(
        frozenCopy.map!(ch => cast(char)(cast(ubyte) ch + 1)).array);
    assert(shifted == "ifmmp!xpsme");
}

/// char range autodecoding
/// Not a type conversion in itself, but a frequent reason to need one.
@("avoiding autodecoding")
@safe unittest {
    import std.algorithm : count, map, sum;
    import std.encoding : isValid;
    import std.exception : assertThrown;
    import std.string : representation;
    import std.utf : byCodeUnit, UTFException, validate;

    char[] bogus = [167, 133, 175];

    // two independent ways of seeing that this is not valid UTF-8
    assertThrown!UTFException(bogus.validate);
    assert(!bogus.isValid);

    // .count on a char[] happens not to autodecode ...
    assert(bogus.count == 3);
    // ... whereas .map does, and therefore trips over the bad sequence
    assertThrown!UTFException(bogus.map!"1".sum == 3);

    // going through ubyte[] sidesteps autodecoding
    assert(bogus.representation.map!"1".sum == 3);

    // byCodeUnit sidesteps it as well while keeping char as the element type
    assert(bogus.byCodeUnit.map!"1".sum == 3);
}

/// string literal -> C string :: address of the first char
/// - string literals always carry a trailing NUL
/// - string literals also convert implicitly to C string pointers
@("string literal conversion to C strings")
nothrow @safe @nogc unittest {
    string literal = "hello world";

    // explicit: take the address of the first element
    const(char)* viaAddress = &literal[0];

    // implicit: a literal converts straight to a pointer
    const(char)* viaCoercion = "hello world";
}
@("string literals have trailing NUL")
nothrow @nogc unittest {
    string literal = "hello world";
    // peek one element past the slice; not @safe, hence no @safe attribute
    const(char)* onePastEnd = literal.ptr + literal.length;
    assert(*onePastEnd == '\0');
}

/// string -> C string :: std.string.toStringz
/// - works for any D string, including slices with no NUL after them
/// - @safe
/// - *always* allocates
/// - preserves qualifiers
///   (NOTE(review): the Phobos signature appears to return immutable(char)*
///   for every input -- confirm this claim)
@("std.string.toStringz")
nothrow @safe unittest {
    import std.string : toStringz;

    string phrase = "hello world";
    const(char)* head = phrase[0 .. 5].toStringz;     // followed by ' ' in phrase
    const(char)* tail = phrase[$ - 5 .. $].toStringz; // followed by the literal's NUL

    // slicing raw pointers and calling strlen are not @safe
    (() @trusted {
        import core.stdc.string : strlen;

        assert(head[0 .. strlen(head)] == "hello");
        assert(tail[0 .. strlen(tail)] == "world");
        // the result is a separate buffer, not a pointer into `phrase`
        assert(phrase.ptr != head);
    })();
}

/// C string -> string :: std.string.fromStringz
/// - merely slices the pointer up to the terminating NUL
/// - doesn't allocate
/// - not @safe, because it has to assume NUL termination
@("std.string.fromStringz")
nothrow @nogc unittest {
    import core.stdc.string : strlen;
    import std.string : fromStringz;

    const(char)* cstr = "hello world";
    const(char)[] view = cstr.fromStringz;

    // the slice starts at the very same address ...
    assert(&view[0] == &cstr[0]);
    // ... and spans exactly strlen characters
    assert(view == cstr[0 .. strlen(cstr)]);
}


/// char[] <-> ubyte[] conversions aren't symmetric:
/// char[]  -> ubyte[] is always safe, error-free, and cheap
/// char[] <-  ubyte[] can fail, because the bytes may not be valid UTF
/// so the char[] <- ubyte[] options above are convenient but somewhat sloppy;
/// for precise operations, use the std.utf tooling
@("std.utf.decodeFront")
unittest {
    const(char)[] mixed = [
        'h', 'e', 'l', 'l', 'o', // valid UTF-8
        167, 133, 175,           // invalid
        't', 'h', 'e', 'r', 'e'  // valid
    ];

    // Decode code points from the front of `rest` until it is exhausted or
    // the first invalid sequence is met; return what was decoded so far.
    dstring takeUTF(const(char)[] rest) {
        import std.utf : decodeFront, UTFException;

        dstring decoded;
        try {
            // decodeFront pops what it decodes, so `rest` shrinks each round
            while (rest.length != 0)
                decoded ~= rest.decodeFront;
        } catch (UTFException) {
            // first invalid sequence: keep the valid prefix and stop
        }
        return decoded;
    }
    assert(takeUTF(mixed) == "hello");

    // std.encoding offers the same valid-prefix notion directly
    // (the author notes having written most of this file unaware of std.encoding)
    import std.algorithm : equal;
    import std.encoding : validLength;

    const validPrefix = mixed[0 .. mixed.validLength];
    assert(takeUTF(mixed).equal(validPrefix));
}

/// char[] <-> wchar[] <-> dchar[] :: std.utf.byUTF family
/// - infallible (ignoring errors or using the Unicode 'replacement character')
/// - @safe
/// - non-allocating
///
/// Each range is compared element by element against a fixed array and the
/// number of elements seen is checked too, so a range that ends early cannot
/// pass unnoticed.
@("std.utf.byUTF 1/3")
nothrow @nogc @safe unittest {
    import std.utf : byChar, byWchar, byDchar;
    import std.range : enumerate;

    string noel = "no\u0308el"; // 'o' followed by U+0308 (combining diaeresis)
    char[6] chars = [110, 111, 204, 136, 101, 108]; // U+0308 is 0xCC 0x88 in UTF-8
    wchar[5] wchars = [110, 111, 776, 101, 108];    // U+0308 fits in one UTF-16 unit
    dchar[5] dchars = [110, 111, 776, 101, 108];

    size_t seen;
    foreach (size_t i, char c; noel.byChar.enumerate) {
        assert(c == chars[i]);
        ++seen;
    }
    assert(seen == chars.length);

    seen = 0;
    foreach (size_t i, wchar c; noel.byWchar.enumerate) {
        assert(c == wchars[i]);
        ++seen;
    }
    assert(seen == wchars.length);

    seen = 0;
    foreach (size_t i, dchar c; noel.byDchar.enumerate) {
        assert(c == dchars[i]);
        ++seen;
    }
    assert(seen == dchars.length);
}
/// byUTF can throw only when the Unicode replacement character is turned off
@("std.utf.byUTF 2/3 (throwing)")
@safe unittest {
    import std.algorithm : count;
    import std.exception : assertNotThrown, assertThrown;
    import std.utf : byUTF, UseReplacementDchar, UTFException;

    enum strict = UseReplacementDchar.no;
    string partial = "hello\247\205\257there"; // three invalid bytes in the middle

    // char -> char: the bad UTF-8 goes unnoticed ...
    assertNotThrown!UTFException(partial.byUTF!(char, strict).count);

    // ... whereas char -> dchar objects to it
    assertThrown!UTFException(partial.byUTF!(dchar, strict).count);
}
// With the replacement character enabled, byUTF never throws: char -> char
// passes invalid bytes through untouched, while char -> dchar substitutes
// U+FFFD (65533) for each invalid byte. byDchar gives the same result as
// byUTF!(dchar, UseReplacementDchar.yes).
@("std.utf.byUTF 3/3 (replacement character)")
nothrow @nogc @safe unittest {
    import std.utf : byUTF, byDchar, UseReplacementDchar;
    import std.range : enumerate;

    string partial = "hello\247\205\257there";
    dchar[13] replaced = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    foreach (size_t i, char c; partial.byUTF!(char, UseReplacementDchar.yes).enumerate)
        assert(c == partial[i]); // invalid UTF ignored and passed through
    foreach (size_t i, dchar c; partial.byUTF!(dchar, UseReplacementDchar.yes).enumerate)
        assert(c == replaced[i]);
    foreach (size_t i, dchar c; partial.byDchar.enumerate)
        assert(c == replaced[i]);
}
/// byUTF returns a range, so it can replace characters within @nogc code;
/// sanitize has a simpler interface, but it allocates and isn't @safe.
/// So which should you use?
/// According to the author's measurements, sanitize is *much* faster: more
/// than twice as fast as the simple byDchar.array alternative, and still 20%
/// faster than non-allocating uses of byDchar.
/// ... and that is when sanitization is actually necessary. In the very common
/// case of already-sanitized input, sanitize was measured 4-6x faster and
/// doesn't allocate.
@("std.encoding.sanitize")
nothrow unittest {
    import std.encoding : sanitize;
    import std.utf : toUTF8;

    string partial = "hello\247\205\257there";
    // the expected code points: each invalid byte becomes U+FFFD (65533)
    dchar[] expected = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    string cleaned = partial.sanitize;
    assert(cleaned == expected.toUTF8);
}

/// string <-> wstring <-> dstring <-> C string :: std.utf.toUTF* family
/// - infallible (in the same way as the byUTF family)
/// - @safe
/// - always allocates
/// The C-string members of the family (toUTFz, toUTF16z) are not exercised here.
@("std.utf.toUTF 1/2")
nothrow @safe unittest {
    import std.utf : toUTF8, toUTF16, toUTF32;

    string noel = "no\u0308el"; // 'o' followed by U+0308 (combining diaeresis)
    char[6] chars = [110, 111, 204, 136, 101, 108]; // U+0308 is 0xCC 0x88 in UTF-8
    wchar[5] wchars = [110, 111, 776, 101, 108];    // U+0308 fits in one UTF-16 unit
    dchar[5] dchars = [110, 111, 776, 101, 108];

    assert(noel.toUTF8 == chars);
    assert(noel.toUTF16 == wchars);
    assert(noel.toUTF32 == dchars);
}
// toUTF8 on a string yields an equal but separately allocated copy, invalid
// bytes included; toUTF32 substitutes U+FFFD (65533) for each invalid byte.
@("std.utf.toUTF 2/2")
nothrow @safe unittest {
    import std.utf : toUTF32, toUTF8;

    string partial = "hello\247\205\257there";
    dchar[13] replaced = [104, 101, 108, 108, 111, 65533, 65533, 65533, 116, 104, 101, 114, 101];

    // same contents, different memory
    assert(toUTF8(partial) == partial);
    assert(toUTF8(partial).ptr != partial.ptr);

    // invalid bytes are replaced when widening
    assert(toUTF32(partial) == replaced);

    // results are directly assignable to the matching string types
    string fromWide = toUTF8("hello"d);
    dstring fromNarrow = toUTF32("there");
}

/// string <-> wstring <-> dstring :: std.conv.to
/// - usually fallible, throws on invalid UTF
/// - @safe
/// - usually allocates (doesn't when idempotent)
@("std.conv.to 1/4 (can throw)")
@safe unittest {
    import std.conv : to;
    import std.utf : UTFException;
    import std.exception : assertThrown;

    string partial = "hello\247\205\257there"; // three invalid bytes in the middle
    assertThrown!UTFException(partial.to!dstring);
}
// Converting a string to its own type hands back the same memory; the pointer
// equality below, together with @nogc, shows that nothing is allocated.
@("std.conv.to 2/4 (idempotent use)")
nothrow @nogc @safe unittest {
    import std.conv : to;

    dstring s1 = "hello"d;
    dstring s2 = s1.to!dstring;
    assert(s1.ptr == s2.ptr);
}
// std.conv.to does not reinterpret the characters as bytes: asking for
// ubyte[] from a char slice raises ConvException (use
// std.string.representation for the byte view instead).
@("std.conv.to 3/4 (doesn't understand char[] -> ubyte[])")
@safe unittest {
    import std.conv : to, ConvException;
    import std.exception : assertThrown;

    const(char)[] s1 = "hello";
    assertThrown!ConvException(s1.to!(ubyte[]));
}
// std.conv.to!string on a byte array formats the numbers as text rather than
// reinterpreting the bytes as characters.
@("std.conv.to 4/4 (conv doesn't do what you want with ubyte[] -> string)")
@safe unittest {
    import std.conv : to;

    ubyte[5] s1 = ['h', 'e', 'l', 'l', 'o'];
    assert(s1.to!string == "[104, 101, 108, 108, 111]");
}