Unescape a string
Within a string, an escape sequence gives special meaning to otherwise ordinary characters.
For example, in the C programming language, the string literal "a\nb"
, containing the escape sequence \n
, would be interpreted by the compiler as the bytes 61 0a 62
, not 61 5c 6e 62
.
In JavaScript Object Notation, the string value "a\u263Ac"
, containing the escape sequence \u263A
, would be parsed into "a☺c"
.
- Task
Write a function (or routine, subroutine, procedure, etc.) that, when given a UTF-8 encoded string, returns a new string, also encoded in UTF-8, with all escape sequences replaced with equivalent Unicode code points.
For this task we'll follow JSON string semantics, defined in Section 7 of RFC 8259. In summary:
- Allowed two-character escapes are
\\
,\"
,\/
,\b
,\f
,\n
,\r
,\t
. "
,\
and control characters (U+0000 through U+001F) must be escaped.- Any "character" in the Basic Multilingual Plane may be escaped with a
\uXXXX
sequence, whereXXXX
are hexadecimal digits encoding the character's code point - Characters outside the Basic Multilingual Plane may be escaped with a
\uXXXX\uXXXX
sequence, whereXXXX
are hexadecimal digits encoding the code point using UTF-16.
Although not required by RFC 8259, we'll treat strings containing invalid \u
escape sequences (those that can't be decoded to a Unicode code point) as an error condition.
Test your program with the following inputs and display the string before and after unescaping. It's OK to display bytes rather than "characters" if it's more convenient for your language, or both, if you prefer.
- Test cases
Input string | Expected result | Comment |
---|---|---|
abc |
abc |
No escape sequences |
a☺c |
a☺c |
No escape sequences, extended unicode |
a\"c |
a"c |
Two-character escape |
\u0061\u0062\u0063 |
abc |
Just escape sequences |
a\\c |
a\c |
Two-character escape for reverse solidus |
a\u263Ac |
a☺c |
\u escape, basic multilingual plane
|
a\\u263Ac |
a\u263Ac |
Escaped \ before u
|
a\uD834\uDD1Ec |
a𝄞c |
\u escape, surrogate pair
|
a\ud834\udd1ec |
a𝄞c |
Lower case hex digits |
a\u263 |
error | Incomplete escape sequence |
a\u263Xc |
error | Invalid hexadecimal digit |
a\uDD1Ec |
error | Lone low surrogate |
a\uD834c |
error | Lone high surrogate |
a\uD834\u263Ac |
error | High surrogate followed by non-surrogate |
- Related tasks
ALGOL 68
Shows the unescaped characters as integer codepoints as well as characters (NB, the integers are shown in decimal).
Algol 68 pre-dates Unicode and does not have separate byte and character types, so the question is: does a STRING hold codepoints or UTF-8 byte sequences ? A STRING is a row of CHAR, CHAR values range from 0 to max abs char
, so we can guess that if max abs char
is "large", STRINGs hold codepoints and if "small" they contain UTF-8. Historically, Multics had max abs char
set to 511 (9-bit characters) so this sample assumes Unicode if max abs char
is larger than that. Algol 68G and probably most current implementations of Algol 68 have max abs char
set to 255.
When displaying the unescaped STRINGs, the CHAROF operator is used to convert the codepoints - if Unicode is in use, no conversion is needed otherwise the codepoints are converted to UTF-8.
BEGIN # convert JSON-style escape sequences to characters #
# the following escapes are recognised: \\, \", \/, \b, \f, \n, \r, \t #
# and \uXXXX for characters in the Base Multilingual Plane and pairs #
# of u\XXXXu\XXXX of surrogates for characters outside that range #
# TRUE if CHAR contains Unicode codepoints, FALSE if bytes #
BOOL using unicode = max abs char > 511;
# returns the length of s #
OP LENGTH = ( STRING s )INT: ( UPB s - LWB s ) + 1;
# mode to hold a codepoint #
MODE CODEPOINT = INT;
# convert a CODEPOINT to an INT, adjust to suit the CODEPOINT mode #
OP INTOF = ( CODEPOINT p )INT: p;
# mode to hold the results of unescaping a string #
MODE UNESCAPED = STRUCT( [ 1 : 0 ]CODEPOINT utf32, STRING error );
# high-surrogate range #
CODEPOINT high surrogate min = ABS 16rd800;
CODEPOINT high surrogate max = ABS 16rdbff;
# low-surrogate range #
CODEPOINT low surrogate min = ABS 16rdc00;
CODEPOINT low surrogate max = ABS 16rdfff;
# modulus to extract the surrogate value #
CODEPOINT surrogate base = ABS 16r03ff + 1;
# returns the hex-digit corresponding to c, or -1 if it is invalid #
OP HEXDIGIT = ( CHAR c )INT:
IF c >= "0" AND c <= "9" THEN ABS c - ABS "0"
ELIF c >= "a" AND c <= "f" THEN 10 + ( ABS c - ABS "a" )
ELIF c >= "A" AND c <= "F" THEN 10 + ( ABS c - ABS "A" )
ELSE -1
FI # HEXDIGIT # ;
# returns a string containing the codepoint t, a single UTF-32 character #
# if that is what CHAR contains or encooded as UTF-8 bytes #
OP CHAROF = ( CODEPOINT t )STRING:
IF using unicode THEN REPR t
ELIF t <= ABS 16r00007f THEN REPR INTOF t
ELIF t <= ABS 16r0007ff THEN REPR ( 192 + INTOF ( t OVER 64 ) )
+ REPR ( 128 + INTOF ( t MOD 64 ) )
ELIF t <= ABS 16r007fff THEN
CODEPOINT b1 := t;
CODEPOINT b3 := b1 MOD 64; b1 OVERAB 64;
CODEPOINT b2 := b1 MOD 64; b1 OVERAB 64;
( REPR INTOF ( 224 + b1 ) + REPR INTOF ( 128 + b2 ) + REPR INTOF ( 128 + b3 ) )
ELIF t <= ABS 16r10ffff THEN
CODEPOINT b1 := t;
CODEPOINT b4 := b1 MOD 64; b1 OVERAB 64;
CODEPOINT b3 := b1 MOD 64; b1 OVERAB 64;
CODEPOINT b2 := b1 MOD 64; b1 OVERAB 64;
( REPR INTOF ( 240 + b1 ) + REPR INTOF ( 128 + b2 )
+ REPR INTOF ( 128 + b3 ) + REPR INTOF ( 128 + b4 )
)
ELSE
print( ( "Invalid code point: ", whole( t, 0 ), newline ) );
REPR INTOF ( t MOD max abs char )
FI # CHAROF # ;
# converts s to UTF-32, error messages are returned in error #
PROC to utf32 = ( STRING s )UNESCAPED:
BEGIN
[ 1 : LENGTH s ]CODEPOINT utf32;
STRING error := "";
INT s pos := LWB s;
INT s max := UPB s;
INT u pos := 0;
WHILE s pos <= s max AND error = "" DO
CHAR c := s[ s pos ];
IF c /= "\" THEN # not an escape #
INT ich = ABS c;
INT additional bytes := 0;
IF using unicode THEN # ich is a codepoint #
utf32[ u pos +:= 1 ] := ich
ELIF ich >= 240 THEN # 4-byte UTF-8 character #
utf32[ u pos +:= 1 ] := ich MOD 8;
additional bytes := 3
ELIF ich >= 224 THEN # 3-byte UTF-8 character #
utf32[ u pos +:= 1 ] := ich MOD 16;
additional bytes := 2
ELIF ich >= 192 THEN # 2-byte UTF-8 character #
utf32[ u pos +:= 1 ] := ich MOD 32;
additional bytes := 1
ELSE # 1-byte character #
utf32[ u pos +:= 1 ] := ABS c
FI;
IF s pos + additional bytes > s max THEN
error := "Truncated UTF-8 sequence"
ELSE
TO additional bytes DO utf32[ u pos ] *:= 64 +:= ABS s[ s pos +:= 1 ] MOD 64 OD
FI
ELIF s pos >= UPB s THEN # nothing after the \ #
error := "string ends with ""\"""
ELIF c := s[ s pos +:= 1 ]; # have an escape sequence #
c = "\" OR c = """" OR c = "/"
THEN utf32[ u pos +:= 1 ] := ABS c
ELIF c = "b" THEN utf32[ u pos +:= 1 ] := 8
ELIF c = "f" THEN utf32[ u pos +:= 1 ] := 12
ELIF c = "n" THEN utf32[ u pos +:= 1 ] := 10
ELIF c = "r" THEN utf32[ u pos +:= 1 ] := 13
ELIF c = "t" THEN utf32[ u pos +:= 1 ] := 9
ELIF c = "u" THEN # UTF-16 character #
CODEPOINT u := 0;
IF s pos + 4 > UPB s THEN
# insufficient characters left in the string #
error := "Missing or truncated hex-digits after \u"
ELSE
# have four possible hex digits #
TO 4 WHILE error = "" DO
c := s[ s pos +:= 1 ];
IF INT digit = HEXDIGIT c;
digit < 0 OR digit > 15
THEN
error := "Invalid hex digit: """ + c + """"
ELSE
u *:= 16 +:= digit
FI
OD
FI;
IF error /= "" THEN
SKIP # the string is invalid #
ELIF u >= high surrogate min
AND u <= high surrogate max
THEN
# have a UTF-32 character - must be followed by the #
# low surrogate #
IF s pos + 6 > UPB s THEN
error := "Missing or truncated low-surrogate"
ELIF s[ s pos +:= 1 ] /= "\"
OR s[ s pos +:= 1 ] /= "u"
THEN
error := "Expected ""\u"" after a high-surrogate"
ELSE
CODEPOINT ls := 0;
TO 4 WHILE error = "" DO
c := s[ s pos +:= 1 ];
IF INT digit = HEXDIGIT c;
digit < 0 OR digit > 15
THEN
error := "Invalid hex digit: """ + c + """"
ELSE
ls *:= 16 +:= digit
FI
OD;
IF ls < low surrogate min
OR ls > low surrogate max
THEN
error := "High-surrogate not followed by a low-surrogate"
ELSE
ls MODAB surrogate base;
u MODAB surrogate base *:= surrogate base +:= ls +:= ABS 16r10000
FI
FI;
utf32[ u pos +:= 1 ] := u
ELIF u >= low surrogate min
AND u <= low surrogate max
THEN
error := "Low surrogate not preceded by a high surrogate"
ELSE
utf32[ u pos +:= 1 ] := u
FI
ELSE # invalid escape #
error := "Invalid escape sequence: ""\" + c + """"
FI;
IF error /= "" THEN # error detected - add the position #
error +:= ", at " + whole( s pos, 0 )
FI;
s pos +:= 1
OD;
( utf32[ 1 : u pos ], error )
END # to utf32 # ;
# returns s left-padded with blanks to at least w characters #
PRIO PAD = 1;
OP PAD = ( INT w, STRING s )STRING:
IF LENGTH s >= w THEN s ELSE ( ( w - LENGTH s ) * " " ) + s FI;
# test cases #
[]STRING tests = ( "abc" # abc No escape sequences #
, "a☺c" # a☺c No escape sequences, extended unicode #
, "a\""c" # a"c Two-character escape #
, "\u0061\u0062\u0063" # abc Just escape sequences #
, "a\\c" # a\c Two-character escape for reverse solidus #
, "a\u263Ac" # a☺c \u escape, basic multilingual plane #
, "a\\u263Ac" # a\u263Ac Escaped \ before u #
, "a𝄞c" # a𝄞c No escape sequences, non-BMP character #
, "a\uD834\uDD1Ec" # a𝄞c \u escape, surrogate pair #
, "a\ud834\udd1ec" # a𝄞c Lower case hex digits #
, "a\u263" # error Incomplete escape sequence #
, "a\u263Xc" # error Invalid hexadecimal digit #
, "a\uDD1Ec" # error Lone low surrogate #
, "a\uD834c" # error Lone high surrogate #
, "a\uD834\u263Ac" # error High surrogate followed by #
# non-surrogate #
);
FOR t pos FROM LWB tests TO UPB tests DO
STRING test = tests[ t pos ];
UNESCAPED result = to utf32( test );
STRING error = error OF result;
print( ( 20 PAD test, ": " ) );
IF error /= "" THEN
print( ( error, newline ) )
ELSE
[]CODEPOINT utf32 = utf32 OF result;
print( ( "(" ) ); # show the codepoints as integers #
FOR u pos FROM LWB utf32 TO UPB utf32 DO print( ( " ", whole( utf32[ u pos ], 0 ) ) ) OD;
IF NOT using unicode THEN # Unicode is not supported - show UTF-8 bytes #
print( ( " |" ) );
FOR u pos FROM LWB utf32 TO UPB utf32 DO
STRING chars = CHAROF utf32[ u pos ];
FOR c pos FROM LWB chars TO UPB chars DO
print( ( " ", whole( ABS chars[ c pos ], 0 ) ) )
OD
OD
FI;
print( ( " ) """ ) );
# show the result as a string #
FOR u pos FROM LWB utf32 TO UPB utf32 DO
print( ( CHAROF utf32[ u pos ] ) )
OD;
print( ( """", newline ) )
FI
OD
END
- Output:
From Algol 68G on TIO.RUN
abc: ( 97 98 99 | 97 98 99 ) "abc" a☺c: ( 97 9786 99 | 97 226 152 186 99 ) "a☺c" a\"c: ( 97 34 99 | 97 34 99 ) "a"c" \u0061\u0062\u0063: ( 97 98 99 | 97 98 99 ) "abc" a\\c: ( 97 92 99 | 97 92 99 ) "a\c" a\u263Ac: ( 97 9786 99 | 97 226 152 186 99 ) "a☺c" a\\u263Ac: ( 97 92 117 50 54 51 65 99 | 97 92 117 50 54 51 65 99 ) "a\u263Ac" a𝄞c: ( 97 119070 99 | 97 240 157 132 158 99 ) "a𝄞c" a\uD834\uDD1Ec: ( 97 119070 99 | 97 240 157 132 158 99 ) "a𝄞c" a\ud834\udd1ec: ( 97 119070 99 | 97 240 157 132 158 99 ) "a𝄞c" a\u263: Missing or truncated hex-digits after \u, at 3 a\u263Xc: Invalid hex digit: "X", at 7 a\uDD1Ec: Low surrogate not preceded by a high surrogate, at 7 a\uD834c: Missing or truncated low-surrogate, at 7 a\uD834\u263Ac: High-surrogate not followed by a low-surrogate, at 13
APL
unescape←{
de←{⎕SIGNAL⊂('EN'11)('Message'(∊⍵))}
his←≥∘55296∧≤∘56319
los←≥∘56320∧≤∘57343
hex←{
16∧.>c←¯1+(16↑⎕D,⎕A)⍳1⎕C⍵:16⊥c
de'Invalid hexadecimal digit: ',⍵
}
0=≢⍵:⍵
'\'≠c←⊃⍵:c,∇1↓⍵
9>i←'"\/bfnrt'⍳t←⊃1↓⍵:(⎕UCS 34 92 47 8 12 10 13 9[i]),∇2↓⍵
'u'≠t:t,∇2↓⍵
6>≢⍵:de'Incomplete escape sequence'
(los⍱his)m←hex 2↓6↑⍵:(⎕UCS m),∇6↓⍵
'\u'≢2↑nx←6↓⍵:de'Lone ',('low' 'high'[1+his m]),' surrogate'
6>≢nx:de'Incomplete escape sequence'
(los⍱his)t←hex 2↓6↑nx:de'Low' 'High'[1+his m],' surrogate followed by non-surrogate'
(los⍲.∨his)m t:de'Invalid surrogate pair'
m t←⌽⍣(los m)⊢m t
(⎕UCS 65536+(1024×m-55296)+t-56320),∇12↓⍵
}
- Output:
unescape 'abc' abc unescape 'a☺c' a☺c unescape 'a\"c' a"c unescape '\u0061\u0062\u0063' abc unescape 'a\\c' a\c unescape 'a\u263Ac' a☺c unescape 'a\\u263Ac' a\u263Ac unescape 'a\uD834\uDD1Ec' a𝄞c unescape 'a\ud834\udd1ec' a𝄞c unescape 'a\u263' DOMAIN ERROR: Incomplete escape sequence unescape 'a\u263Xc' DOMAIN ERROR: Invalid hexadecimal digit: 263X unescape 'a\uDD1Ec' DOMAIN ERROR: Lone low surrogate unescape 'a\uD834c' DOMAIN ERROR: Lone high surrogate unescape 'a\uD834\u263Ac' DOMAIN ERROR: High surrogate followed by non-surrogate
C++
#include <exception>
#include <iostream>
#include <string> // c++17 for string_view
#include <vector>
class UnescapeError : public std::exception {
public:
UnescapeError(std::string_view message) : m_message{message} {};
const char* what() const noexcept override { return m_message.c_str(); };
private:
std::string m_message{};
};
std::int32_t parse_hex_digits(std::string_view digits) {
std::int32_t code_point{};
for (const auto digit : digits) {
code_point <<= 4;
switch (digit) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
code_point |= (digit - '0');
break;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
code_point |= (digit - 'a' + 10);
break;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
code_point |= (digit - 'A' + 10);
break;
default:
throw UnescapeError("invalid \\uXXXX escape");
}
}
return code_point;
}
bool is_high_surrogate(std::int32_t code_point) {
return code_point >= 0xD800 && code_point <= 0xDBFF;
}
bool is_low_surrogate(std::int32_t code_point) {
return code_point >= 0xDC00 && code_point <= 0xDFFF;
}
std::string encode_utf8(std::int32_t code_point) {
std::string rv;
if (code_point <= 0x7F) {
// Single-byte UTF-8
rv += static_cast<char>(code_point & 0x7F);
} else if (code_point <= 0x7FF) {
// Two-byte UTF-8
rv += static_cast<char>(0xC0 | ((code_point >> 6) & 0x1F));
rv += static_cast<char>(0x80 | (code_point & 0x3F));
} else if (code_point <= 0xFFFF) {
// Three-byte UTF-8
rv += static_cast<char>(0xE0 | ((code_point >> 12) & 0x0F));
rv += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
rv += static_cast<char>(0x80 | (code_point & 0x3F));
} else if (code_point <= 0x10FFFF) {
// Four-byte UTF-8
rv += static_cast<char>(0xF0 | ((code_point >> 18) & 0x07));
rv += static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
rv += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
rv += static_cast<char>(0x80 | (code_point & 0x3F));
} else {
throw UnescapeError("invalid code point");
}
return rv;
}
std::string unescape_json_string(std::string_view sv) {
std::string rv{};
unsigned char byte{}; // current byte
std::int32_t code_point; // decoded \uXXXX or \uXXXX\uXXXX escape sequence
std::string::size_type index{0}; // current byte index in sv
std::string::size_type length{sv.length()};
while (index < length) {
byte = sv[index++];
if (byte == '\\') {
if (index < length) {
byte = sv[index++];
} else {
throw UnescapeError("invalid escape");
}
switch (byte) {
case '"':
rv.push_back('"');
break;
case '\\':
rv.push_back('\\');
break;
case '/':
rv.push_back('/');
break;
case 'b':
rv.push_back('\b');
break;
case 'f':
rv.push_back('\f');
break;
case 'n':
rv.push_back('\n');
break;
case 'r':
rv.push_back('\r');
break;
case 't':
rv.push_back('\t');
break;
case 'u':
// Decode 4 hex digits.
if (index + 4 > length) {
throw UnescapeError("invalid \\uXXXX escape");
}
code_point = parse_hex_digits(sv.substr(index, 4));
index += 4;
if (is_low_surrogate(code_point)) {
throw UnescapeError("unexpected low surrogate code point");
}
if (is_high_surrogate(code_point)) {
if (!(index + 6 <= length && sv[index] == '\\' &&
sv[index + 1] == 'u')) {
throw UnescapeError("incomplete escape sequence");
}
std::int32_t low_surrogate =
parse_hex_digits(sv.substr(index + 2, 4));
index += 6;
if (!is_low_surrogate(low_surrogate)) {
throw UnescapeError("unexpected code point");
}
// Combine high and low surrogates into a Unicode code point.
code_point = 0x10000 + (((code_point & 0x03FF) << 10) |
(low_surrogate & 0x03FF));
}
rv.append(encode_utf8(code_point));
break;
default:
throw UnescapeError("invalid escape");
}
} else {
// Find invalid characters.
// Bytes that are less than 0x1f and not a continuation byte.
if ((byte & 0x80) == 0) {
// Single-byte code point
if (byte <= 0x1F) {
throw UnescapeError("invalid character");
}
rv.push_back(byte);
} else if ((byte & 0xE0) == 0xC0) {
// Two-byte code point
if (index + 1 > length) {
throw UnescapeError("invalid code point");
}
rv.push_back(byte);
rv.push_back(sv[index++]);
} else if ((byte & 0xF0) == 0xE0) {
// Three-byte code point
if (index + 2 > length) {
throw UnescapeError("invalid code point");
}
rv.push_back(byte);
rv.push_back(sv[index++]);
rv.push_back(sv[index++]);
} else if ((byte & 0xF8) == 0xF0) {
// Four-byte code point
if (index + 3 > length) {
throw UnescapeError("invalid code point");
}
rv.push_back(byte);
rv.push_back(sv[index++]);
rv.push_back(sv[index++]);
rv.push_back(sv[index++]);
} else {
throw UnescapeError("invalid character");
}
}
}
return rv;
}
const std::vector<std::string> TEST_CASES = {
"abc",
"a\xE2\x98\xba" "c",
"a\\\"c",
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
};
int main(int argc, char const* argv[]) {
for (const auto& str : TEST_CASES) {
try {
auto unescaped{unescape_json_string(str)};
std::cout << str << " -> " << unescape_json_string(str) << std::endl;
} catch (const UnescapeError& e) {
std::cout << str << " -> " << e.what() << std::endl;
}
}
return 0;
}
- Output:
abc -> abc a☺c -> a☺c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a☺c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a𝄞c a\ud834\udd1ec -> a𝄞c a\u263 -> invalid \uXXXX escape a\u263Xc -> invalid \uXXXX escape a\uDD1Ec -> unexpected low surrogate code point a\uD834c -> incomplete escape sequence a\uD834\u263Ac -> unexpected code point
EasyLang
func$ unescape val$ .
subr tonum
h$ = "0x"
for ind = ind to ind + 3
h$ &= val$[ind]
.
v = number h$
if error = 1
print "error: no hex number"
return ""
.
.
#
val$[] = strchars val$
ind = 1
while ind <= len val$[]
c$ = val$[ind]
ind += 1
if c$ = "\\"
if ind > len val$[]
print "error: expected escape character"
return ""
.
c$ = val$[ind]
ind += 1
if c$ = "\"" or c$ = "\\" or c$ = "/"
r$ &= c$
elif c$ = "b"
r$ &= strchar 0x08
elif c$ = "f"
r$ &= strchar 0x0c
elif c$ = "n"
r$ &= "\n"
elif c$ = "r"
r$ &= strchar 0x0d
elif c$ = "t"
r$ &= strchar 0x09
elif c$ = "u"
if ind + 3 > len val$[]
print "error: unexpected end"
return ""
.
tonum
if v >= 0xdc00 and v <= 0xdfff
print "error: unexpected low surrogate code point"
return ""
.
if v >= 0xd800 and v <= 0xdbff
vh = v
if ind + 6 > len val$[]
print "error: unexpected end"
return ""
.
if val$[ind] & val$[ind + 1] <> "\\u"
print "error: expected \\u"
return ""
.
ind += 2
tonum
if not (v >= 0xdc00 and v <= 0xdfff)
print "error: expected low surrogate code point"
return ""
.
v = 0x10000 + bitor bitshift bitand vh 0x03ff 10 bitand v 0x03ff
.
r$ &= strchar v
else
print "error: expected escape character"
return ""
.
else
r$ &= c$
.
.
return r$
.
repeat
s$ = input
until s$ = ""
write s$ & " --> "
r$ = unescape s$
if r$ <> ""
print r$
.
.
#
input_data
abc
a☺c
a\"c
\u0061\u0062\u0063
a\\c
a\u263Ac
a\\u263Ac<c/ode>
a\uD834\uDD1Ec
a\ud834\udd1ec
a\u263
a\u263Xc
a\uDD1Ec
a\uD834c
a\uD834\u263Ac
- Output:
abc --> abc a☺c --> a☺c a\"c --> a"c \u0061\u0062\u0063 --> abc a\\c --> a\c a\u263Ac --> a☺c a\\u263Ac<c/ode> --> a\u263Ac<c/ode> a\uD834\uDD1Ec --> a𝄞c a\ud834\udd1ec --> a𝄞c a\u263 --> error: unexpected end a\u263Xc --> error: no hex number a\uDD1Ec --> error: unexpected low surrogate code point a\uD834c --> error: unexpected end a\uD834\u263Ac --> error: expected low surrogate code point
F#
// Unescape a string. Nigel Galloway: August 13th., 2024
let tests=["abc";"a☺c";"a\"c";"\u0061\u0062\u0063";"a\\c";"a\\u263Ac";"a\uD834\uDD1Ec";"a\ud834\udd1ec";"a\u263";"a\u263Xc";"a\uDD1Ec";"a\uD834c";"a\uD834c";"a\uD834\u263Ac"]
tests|>Seq.iter(System.Text.RegularExpressions.Regex.Unescape>>printfn "%s")
- Output:
abc a☺c a"c abc a\c a\u263Ac a𝄞c a𝄞c a\u263 a\u263Xc a�c a�c a�c a�☺c
FreeBASIC
Type DecodeResult
codepoint As Integer
index As Integer
End Type
Function isHighSurrogate (Byval codepoint As Integer) As Boolean
Return codepoint >= &HD800 Andalso codepoint <= &HDBFF
End Function
Function isLowSurrogate (Byval codepoint As Integer) As Boolean
Return codepoint >= &HDC00 Andalso codepoint <= &HDFFF
End Function
Function parseHexDigits(Byref digits As String, Byref index As Integer) As Integer
Dim As Integer i, codepoint
Dim As Ubyte digit
codepoint = 0
For i = 1 To Len(digits)
digit = Asc(Lcase(Mid(digits, i, 1)))
codepoint Shl= 4
If digit >= Asc("0") And digit <= Asc("9") Then
codepoint += digit - Asc("0")
Elseif digit >= Asc("a") And digit <= Asc("f") Then
codepoint += digit - Asc("a") + 10
Else
Print "invalid \uXXXX escape sequence, at index " & index
Return -1
End If
Next
Return codepoint
End Function
Function decodeHexChar(Byref value As String, Byref index As Integer) As DecodeResult
Dim As Integer length, lowSurrogate
Dim As DecodeResult result
length = Len(value)
If index + 4 > length Then
Print "incomplete escape sequence, at index " & (index - 2)
result.codepoint = -1
Return result
End If
index += 1 ' Move past 'u'
result.codepoint = parseHexDigits(Mid(value, index, 4), index - 3)
If isLowSurrogate(result.codepoint) Then
Print "unexpected low surrogate code point, at index " & (index - 3)
result.codepoint = -1
Return result
End If
If isHighSurrogate(result.codepoint) Then
If Not (index + 9 < length And Mid(value, index + 4, 2) = "\u") Then
Print "incomplete escape sequence, at index " & (index - 3)
result.codepoint = -1
Return result
End If
lowSurrogate = parseHexDigits(Mid(value, index + 6, 4), index + 4)
If Not isLowSurrogate(lowSurrogate) Then
Print "unexpected code point, at index " & (index + 3)
result.codepoint = -1
Return result
End If
result.codepoint = &H10000 + ((result.codepoint And &H03FF) Shl 10) + (lowSurrogate And &H03FF)
result.index = index + 9
Return result
End If
result.index = index + 3
Return result
End Function
Function stringFromCodepoint(Byval codepoint As Integer, Byval index As Integer) As String
If codepoint <= &H1F Then
Print "invalid character, at index " & index
Return ""
End If
Return Chr(codepoint)
End Function
Function unescapeString(Byref value As String) As String
Dim As String res
Dim As Integer length, index, ch, startIndex
Dim As DecodeResult result
res = ""
length = Len(value)
index = 1
While index <= length
ch = Asc(Mid(value, index, 1))
If ch = Asc("\") Then
index += 1 ' Move past '\'
ch = Asc(Mid(value, index, 1))
Select Case ch
Case Asc(""""), Asc("\"), Asc("/")
res &= Chr(ch)
Case Asc("b")
res &= Chr(8)
Case Asc("f")
res &= Chr(12)
Case Asc("n")
res &= Chr(10)
Case Asc("r")
res &= Chr(13)
Case Asc("t")
res &= Chr(9)
Case Asc("u")
startIndex = index - 1
result = decodeHexChar(value, index)
If result.codepoint = -1 Then Return res
res &= stringFromCodepoint(result.codepoint, startIndex)
index = result.index
Case Else
Print "unknown escape sequence, at index " & (index - 1)
Return res
End Select
Else
res &= Chr(ch)
End If
index += 1
Wend
Return res
End Function
Dim As String test(13) = { _
"abc", "a?c", "a\""c", "\u0061\u0062\u0063", "a\\c", "a\u263Ac", _
"a\\u263Ac", "a\uD834\uDD1Ec", "a\ud834\udd1ec", "a\u263", "a\u263Xc", _
"a\uDD1Ec", "a\uD834c", "a\uD834\u263Ac" }
For i As Integer = 0 To Ubound(test)
Print test(i) & " -> ";
Dim As String result = unescapeString(test(i))
If result <> "a" Then Print result
Next
Sleep
- Output:
abc -> abc a?c -> a?c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a:c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a▲c a\ud834\udd1ec -> a▲c a\u263 -> incomplete escape sequence, at index 1 a\u263Xc -> invalid \uXXXX escape sequence, at index 1 a\uDD1Ec -> unexpected low surrogate code point, at index 1 a\uD834c -> incomplete escape sequence, at index 1 a\uD834\u263Ac -> unexpected code point, at index 7
jq
Works with jq, the C implementation of jq
Works with jaq, the Rust implementation of jq
To transform JSON strings to raw text using jq, one has only to use the -r command-line option. This is true for the C, Go and Rust implementations, as exemplified by the following transcript using gojq:
$ echo '"\u0061\u0062\u0063"' | gojq -r . abc
The present task, however, also requires handling strings that are not quite valid as JSON strings. This can, at least in principle, be accomplished using jq's exception-handling mechanism, and indeed, using the C implementation of jq and assuming the text shown in the first column of the task description is in a file named unescape.txt, we find that:
< unescape.txt sed -e 's/^/"/' -e 's/$/"/' | jq -nr 'recurse( try inputs catch . ) | select(.)'
yields:
abc a☺c a"c abc a\c a☺c a\u263Ac a𝄞c a𝄞c Invalid \uXXXX escape at line 10, column 8 Invalid characters in \uXXXX escape at line 10, column 18 a�c Invalid \uXXXX\uXXXX surrogate pair escape at line 11, column 10 Invalid \uXXXX\uXXXX surrogate pair escape at line 11, column 26
Notice that an error condition is not raised for the string "a\uDD1Ec". The JSON linter at https://jsonlint.com/ agrees that this is "valid JSON" so we won't pursue the matter further except to note that jaq, the Rust implementation, flags "a\uDD1Ec" as an error:
echo '"a\uDD1Ec"' | jaq -r . Error: failed to parse: invalid character with index 56606
Otherwise, jaq produces similar results to those shown above.
However, gojq, the Go implementation, handles non-JSON strings differently.
JavaScript
class UnescapeError extends Error {
constructor(message, index) {
super(message);
this.name = "UnescapeError";
this.index = index;
this.message = `${message}, at index ${index}`;
}
}
function unescapeString(value) {
const rv = [];
const length = value.length;
let index = 0;
let startIndex = 0;
let codepoint;
while (index < length) {
const ch = value[index];
if (ch === "\\") {
index += 1; // Move past '\'
switch (value[index]) {
case '"':
rv.push('"');
break;
case "\\":
rv.push("\\");
break;
case "/":
rv.push("/");
break;
case "b":
rv.push("\x08");
break;
case "f":
rv.push("\x0C");
break;
case "n":
rv.push("\n");
break;
case "r":
rv.push("\r");
break;
case "t":
rv.push("\t");
break;
case "u":
startIndex = index - 1;
[codepoint, index] = decodeHexChar(value, index);
rv.push(stringFromCodePoint(codepoint, startIndex));
break;
default:
throw new UnescapeError(`unknown escape sequence`, index - 1);
}
} else {
rv.push(ch);
}
index += 1;
}
return rv.join("");
}
function decodeHexChar(value, index) {
const length = value.length;
if (index + 4 >= length) {
throw new UnescapeError(`incomplete escape sequence`, index - 1);
}
index += 1; // Move past 'u'
let codepoint = parseHexDigits(value.slice(index, index + 4), index - 2);
if (isLowSurrogate(codepoint)) {
throw new UnescapeError(`unexpected low surrogate code point`, index - 2);
}
if (isHighSurrogate(codepoint)) {
if (!(index + 9 < length && value[index + 4] === "\\" && value[index + 5] === "u")) {
throw new UnescapeError(`incomplete escape sequence`, index - 2);
}
const lowSurrogate = parseHexDigits(value.slice(index + 6, index + 10), index + 4);
if (!isLowSurrogate(lowSurrogate)) {
throw new UnescapeError(`unexpected code point`, index + 4);
}
codepoint = 0x10000 + (((codepoint & 0x03ff) << 10) | (lowSurrogate & 0x03ff));
return [codepoint, index + 9];
}
return [codepoint, index + 3];
}
function parseHexDigits(digits, index) {
let codepoint = 0;
for (const digit of digits) {
codepoint <<= 4;
if (digit >= '0' && digit <= '9') {
codepoint |= digit.charCodeAt(0) - '0'.charCodeAt(0);
} else if (digit >= 'A' && digit <= 'F') {
codepoint |= digit.charCodeAt(0) - 'A'.charCodeAt(0) + 10;
} else if (digit >= 'a' && digit <= 'f') {
codepoint |= digit.charCodeAt(0) - 'a'.charCodeAt(0) + 10;
} else {
throw new UnescapeError("invalid \\uXXXX escape sequence", index);
}
}
return codepoint;
}
function stringFromCodePoint(codepoint, index) {
if (codepoint === undefined || codepoint <= 0x1f) {
throw new UnescapeError("invalid character", index);
}
try {
return String.fromCodePoint(codepoint);
} catch {
throw new UnescapeError("invalid escape sequence", index);
}
}
function isHighSurrogate(codepoint) {
return codepoint >= 0xd800 && codepoint <= 0xdbff;
}
function isLowSurrogate(codepoint) {
return codepoint >= 0xdc00 && codepoint <= 0xdfff;
}
const testCases = [
"abc",
"a?c",
'a\\"c',
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
];
for (const s of testCases) {
try {
console.log(`${s} -> ${unescapeString(s)}`);
} catch (err) {
if (err instanceof UnescapeError) {
console.log(`${s} -> ${err.message}`);
} else {
throw err;
}
}
}
Same as TypeScript entry.
Julia
struct UnescapeError <: Exception
message::String
index::Int
end
function unescape_string(value)
rv = Char[]
len = length(value)
index = 1
start_index = 0
while index <= len
ch = value[index]
if ch == '\\'
index += 1 # Move past '\'
if index > len
throw(UnescapeError("incomplete escape sequence", index - 1))
end
if value[index] == '"'
push!(rv, '"')
elseif value[index] == '\\'
push!(rv, '\\')
elseif value[index] == '/'
push!(rv, '/')
elseif value[index] == 'b'
push!(rv, '\x08')
elseif value[index] == 'f'
push!(rv, '\x0C')
elseif value[index] == 'n'
push!(rv, '\n')
elseif value[index] == 'r'
push!(rv, '\r')
elseif value[index] == 't'
push!(rv, '\t')
elseif value[index] == 'u'
start_index = index - 1
codepoint, index = decode_hex_char(value, index)
push!(rv, string_from_code_point(codepoint, start_index))
else
throw(UnescapeError("unknown escape sequence", index - 1))
end
else
push!(rv, ch)
end
index += 1
end
return String(rv)
end
function decode_hex_char(value, index)
len = length(value)
if index + 4 > len
throw(UnescapeError("incomplete escape sequence", index - 1))
end
index += 1 # Move past 'u'
codepoint = parse_hex_digits(value[index:index+3], index - 2)
if is_low_surrogate(codepoint)
throw(UnescapeError("unexpected low surrogate code point", index - 2))
end
if is_high_surrogate(codepoint)
if !(index + 9 <= len && value[index+4] == '\\' && value[index+5] == 'u')
throw(UnescapeError("incomplete escape sequence", index - 2))
end
low_surrogate = parse_hex_digits(value[index+6:index+9], index + 4)
if !is_low_surrogate(low_surrogate)
throw(UnescapeError("unexpected code point", index + 4))
end
codepoint = 0x10000 + (((codepoint & 0x03ff) << 10) | (low_surrogate & 0x03ff))
return codepoint, index + 9
end
return codepoint, index + 3
end
function parse_hex_digits(hexdigits, index)
codepoint = zero(UInt32)
for digit in hexdigits
codepoint <<= 4
if '0' <= digit <= '9'
codepoint |= UInt32(digit) - UInt32('0')
elseif 'A' <= digit <= 'F'
codepoint |= UInt32(digit) - UInt32('A') + 10
elseif 'a' <= digit <= 'f'
codepoint |= UInt32(digit) - UInt32('a') + 10
else
throw(UnescapeError("invalid \\uXXXX escape sequence from $hexdigits", index))
end
end
return codepoint
end
function string_from_code_point(codepoint, index)
if codepoint isa Nothing || codepoint <= 0x1f
throw(UnescapeError("invalid character", index))
end
try
return Char(codepoint)
catch err
throw(UnescapeError("invalid escape sequence", index))
end
end
is_high_surrogate(codepoint) = 0xd800 <= codepoint <= 0xdbff
is_low_surrogate(codepoint) = 0xdc00 <= codepoint <= 0xdfff
# Test cases
test_cases = [
"abc",
"a?c",
"""a\\"c""",
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
]
for s in test_cases
try
println(s, " -> ", unescape_string(s))
catch err
println(s, " -> $(err.message), at index $(err.index - 1)")
end
end
- Output:
Same as Typescript example.
Nu
def unescape [] {
$'"($in)"' | from json -s
}
# test
[
'abc'
'a☺c'
'a\"c'
'\u0061\u0062\u0063'
'a\\c'
'a\u263Ac'
'a\\u263Ac'
'a\uD834\uDD1Ec'
'a\ud834\udd1ec'
'a"c'
'a\u263'
'a\u263Xc'
'a\uDD1Ec'
'a\uD834c'
'a\uD834\u263Ac'
]
| each {
{input: $in result: ($in | try { unescape } catch { '<error>' })}
}
- Output:
╭────┬────────────────────┬──────────╮ │ # │ input │ result │ ├────┼────────────────────┼──────────┤ │ 0 │ abc │ abc │ │ 1 │ a☺c │ a☺c │ │ 2 │ a\"c │ a"c │ │ 3 │ \u0061\u0062\u0063 │ abc │ │ 4 │ a\\c │ a\c │ │ 5 │ a\u263Ac │ a☺c │ │ 6 │ a\\u263Ac │ a\u263Ac │ │ 7 │ a\uD834\uDD1Ec │ a𝄞c │ │ 8 │ a\ud834\udd1ec │ a𝄞c │ │ 9 │ a"c │ <error> │ │ 10 │ a\u263 │ <error> │ │ 11 │ a\u263Xc │ <error> │ │ 12 │ a\uDD1Ec │ <error> │ │ 13 │ a\uD834c │ <error> │ │ 14 │ a\uD834\u263Ac │ <error> │ ╰────┴────────────────────┴──────────╯
Phix
with javascript_semantics
function parse_hex_digits(string digits, integer i)
integer codepoint = 0;
for digit in lower(digits) do
codepoint *= #10
if digit>='0' and digit<='9' then
codepoint += digit - '0';
elsif digit >= 'a' and digit <= 'f' then
codepoint += digit - 'a' + 10;
else
throw(sprintf(`invalid \\uXXXX escape sequence, at index %d`, {i}));
end if
end for
return codepoint;
end function
function is_low_surrogate(integer codepoint)
return codepoint >= 0xdc00 and codepoint <= 0xdfff;
end function
function is_high_surrogate(integer codepoint)
return codepoint >= 0xd800 and codepoint <= 0xdbff;
end function
function decode_hex_char(string s, integer i)
integer l = length(s);
if i+4>l then
throw(sprintf(`incomplete escape sequence, at index %d`, {i-1}));
end if
i += 1; // Move past 'u'
integer codepoint = parse_hex_digits(s[i..i+3], i-2);
if is_low_surrogate(codepoint) then
throw(sprintf(`unexpected low surrogate code point, at index %d`, {i-2}));
end if
if is_high_surrogate(codepoint) then
if not(i + 9 < l and s[i + 4] == '\\' and s[i + 5] == 'u') then
throw(sprintf(`incomplete escape sequence, at index %d`, {i-2}));
end if
integer low_surrogate = parse_hex_digits(s[i+6..i+9], i+4);
if not is_low_surrogate(low_surrogate) then
throw(sprintf(`unexpected code point, at index %d`, {i+4}));
end if
codepoint = 0x10000 + ((codepoint && 0x03ff) * #400) + (low_surrogate && 0x03ff);
return {codepoint, i+9};
end if
return {codepoint, i+3}
end function
function string_from_codepoint(integer codepoint, i)
if codepoint <= 0x1f then
throw(sprintf(`invalid character, at index %d`, {i}));
end if
// ASIDE: this is the really tricky hidden and somewhat cheaty part,
// that's likely to be rather different in every language/submission.
--
-- So far / at the time of writing:
-- ALGOL 68: CHAROF codepoint; (kind of/I think)
-- APL: your guess is as good as mine
-- C++: full on manual code
-- EasyLang: strchar v
-- F#: n/a (uses Regex.Unescape)
-- FreeBASIC: Chr(codepoint)
-- jq: n/a (uses -r command line option)
-- JavaScript: String.fromCodePoint(codepoint);
-- Julia: Char(codepoint)
-- Nu: n/a (uses from json)
-- Phix: utf32_to_utf8({codepoint})
-- Python: chr(codepoint)
-- "" jbs: full on manual code
-- Raku: n/a (autohandled by "say $o.made", I think)
-- RPL: n/a [no unicode support]
-- TypeScript: as Javascript
-- Wren: String.fromCodePoint(cp)
--
return utf32_to_utf8({codepoint})
end function
function unescape(string s)
string res = ""
integer l = length(s), i=1
while i<=l do
integer ch = s[i];
if ch == '\\' then
i += 1; // Move past '\'
ch = s[i]
switch ch do
case '"','\\','/': res &= ch
case 'b': res &= '\x08'
case 'f': res &= '\x0C'
case 'n': res &= '\n'
case 'r': res &= '\r'
case 't': res &= '\t'
case 'u':
integer startIndex = i - 1, codepoint;
{codepoint, i} = decode_hex_char(s, i);
res &= string_from_codepoint(codepoint, startIndex);
default:
throw(sprintf(`unknown escape sequence, at index %d`, {i-1}));
end switch
else
res &= ch
end if
i += 1;
end while
return res
end function
constant tests = {
"abc",
"a☺c",
"a\\\"c",
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
};
for s in tests do
try
printf(1,"%s -> %s\n",{s,unescape(s)});
catch err
printf(1,"%s -> %s\n",{s,err[E_USER]});
end try
end for
- Output:
Note that indices in Phix are 1-based, so a 2 means the second character, like it jolly well should!
Output on a Windows console is, as ever, somewhat suspect wrt unicode, below is from a browser & should match Linux.
abc -> abc a☺c -> a☺c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a☺c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a𝄞c a\ud834\udd1ec -> a𝄞c a\u263 -> incomplete escape sequence, at index 2 a\u263Xc -> invalid \\uXXXX escape sequence, at index 2 a\uDD1Ec -> unexpected low surrogate code point, at index 2 a\uD834c -> incomplete escape sequence, at index 2 a\uD834\u263Ac -> unexpected code point, at index 8
Python
#!/usr/bin/python3
class UnescapeError(Exception):
def __init__(self, message: str, index: int):
super().__init__(f"{message}, at index {index}")
self.message = message
self.index = index
self.name = "UnescapeError"
def unescape_string(value: str) -> str:
rv = []
length = len(value)
index = 0
start_index = 0
while index < length:
ch = value[index]
if ch == "\\":
index += 1 # Move past '\'
if index >= length:
raise UnescapeError("incomplete escape sequence", index - 1)
if value[index] == '"':
rv.append('"')
elif value[index] == "\\":
rv.append("\\")
elif value[index] == "/":
rv.append("/")
elif value[index] == "b":
rv.append("\x08")
elif value[index] == "f":
rv.append("\x0C")
elif value[index] == "n":
rv.append("\n")
elif value[index] == "r":
rv.append("\r")
elif value[index] == "t":
rv.append("\t")
elif value[index] == "u":
start_index = index - 1
codepoint, index = decode_hex_char(value, index)
rv.append(string_from_code_point(codepoint, start_index))
else:
raise UnescapeError("unknown escape sequence", index - 1)
else:
rv.append(ch)
index += 1
return "".join(rv)
def decode_hex_char(value: str, index: int) -> (int, int):
length = len(value)
if index + 4 >= length:
raise UnescapeError("incomplete escape sequence", index - 1)
index += 1 # Move past 'u'
codepoint = parse_hex_digits(value[index:index + 4], index - 2)
if is_low_surrogate(codepoint):
raise UnescapeError("unexpected low surrogate code point", index - 2)
if is_high_surrogate(codepoint):
if not (index + 9 < length and value[index + 4] == "\\" and value[index + 5] == "u"):
raise UnescapeError("incomplete escape sequence", index - 2)
low_surrogate = parse_hex_digits(value[index + 6:index + 10], index + 4)
if not is_low_surrogate(low_surrogate):
raise UnescapeError("unexpected code point", index + 4)
codepoint = 0x10000 + (((codepoint & 0x03ff) << 10) | (low_surrogate & 0x03ff))
return codepoint, index + 9
return codepoint, index + 3
def parse_hex_digits(digits: str, index: int) -> int:
codepoint = 0
for digit in digits:
codepoint <<= 4
if '0' <= digit <= '9':
codepoint |= ord(digit) - ord('0')
elif 'A' <= digit <= 'F':
codepoint |= ord(digit) - ord('A') + 10
elif 'a' <= digit <= 'f':
codepoint |= ord(digit) - ord('a') + 10
else:
raise UnescapeError("invalid \\uXXXX escape sequence", index)
return codepoint
def string_from_code_point(codepoint: int, index: int) -> str:
if codepoint is None or codepoint <= 0x1f:
raise UnescapeError("invalid character", index)
try:
return chr(codepoint)
except ValueError:
raise UnescapeError("invalid escape sequence", index)
def is_high_surrogate(codepoint: int) -> bool:
return 0xd800 <= codepoint <= 0xdbff
def is_low_surrogate(codepoint: int) -> bool:
return 0xdc00 <= codepoint <= 0xdfff
# Test cases
test_cases = [
"abc",
"a☺c",
'a\\"c',
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
]
for s in test_cases:
try:
print(f"{s} -> {unescape_string(s)}")
except UnescapeError as err:
print(f"{s} -> {err}")
- Output:
abc -> abc a☺c -> a☺c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a☺c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a𝄞c a\ud834\udd1ec -> a𝄞c a\u263 -> incomplete escape sequence, at index 1 a\u263Xc -> invalid \uXXXX escape sequence, at index 1 a\uDD1Ec -> unexpected low surrogate code point, at index 1 a\uD834c -> incomplete escape sequence, at index 1 a\uD834\u263Ac -> unexpected code point, at index 7
Just byte strings
from typing import Tuple
from typing import ByteString
class UnescapeError(Exception):
pass
def is_high_surrogate(code_point: int) -> bool:
return code_point >= 0xD800 and code_point <= 0xDBFF
def is_low_surrogate(code_point: int) -> bool:
return code_point >= 0xDC00 and code_point <= 0xDFFF
def parse_hex_digits(digits: ByteString) -> int:
code_point = 0
for digit in digits:
code_point <<= 4
if digit >= 48 and digit <= 57:
code_point |= digit - 48
elif digit >= 65 and digit <= 70:
code_point |= digit - 65 + 10
elif digit >= 97 and digit <= 102:
code_point |= digit - 97 + 10
else:
raise UnescapeError("invalid \\uXXXX escape sequence")
return code_point
def utf8_encode(code_point: int) -> ByteString:
_bytes = bytearray()
if code_point <= 0x7F:
_bytes.append(code_point & 0x7F)
elif code_point <= 0x7FF:
_bytes.append(0xC0 | ((code_point >> 6) & 0x1F))
_bytes.append(0x80 | (code_point & 0x3F))
elif code_point <= 0xFFFF:
_bytes.append(0xE0 | ((code_point >> 12) & 0x0F))
_bytes.append(0x80 | ((code_point >> 6) & 0x3F))
_bytes.append(0x80 | (code_point & 0x3F))
elif code_point <= 0x10FFFF:
_bytes.append(0xF0 | ((code_point >> 18) & 0x07))
_bytes.append(0x80 | ((code_point >> 12) & 0x3F))
_bytes.append(0x80 | ((code_point >> 6) & 0x3F))
_bytes.append(0x80 | (code_point & 0x3F))
else:
raise UnescapeError("invalid code point")
return _bytes
def decode_escape_sequence(value: ByteString, index: int) -> Tuple[int, int]:
byte = value[index]
if byte == 0x22:
return 0x22, index
if byte == 0x5C:
return 0x5C, index
if byte == 0x2F:
return 0x2F, index
if byte == 0x62:
return 0x08, index
if byte == 0x66:
return 0x0C, index
if byte == 0x6E:
return 0x0A, index
if byte == 0x72:
return 0x0D, index
if byte == 0x74:
return 0x09, index
if byte == 0x75:
code_point, index = decode_hex_char(value, index)
return code_point, index
raise UnescapeError("unknown escape sequence at")
def decode_hex_char(value: ByteString, index: int) -> Tuple[int, int]:
length = len(value)
if index + 4 >= length:
raise UnescapeError("incomplete escape sequence")
index += 1 # move past 'u'
code_point = parse_hex_digits(value[index : index + 4])
if is_low_surrogate(code_point):
raise UnescapeError("unexpected low surrogate")
if is_high_surrogate(code_point):
# expect a surrogate pair
if not (
index + 9 < length and value[index + 4] == 0x5C and value[index + 5] == 0x75
):
raise UnescapeError("incomplete escape sequence at index")
low_surrogate = parse_hex_digits(value[index + 6 : index + 10])
if not is_low_surrogate(low_surrogate):
raise UnescapeError(
"unexpected code point",
)
code_point = 0x10000 + (
((code_point & 0x03FF) << 10) | (low_surrogate & 0x03FF)
)
return (code_point, index + 9)
return (code_point, index + 3)
def unescape(value: ByteString) -> ByteString:
unescaped = bytearray()
index = 0
length = len(value)
while index < length:
byte = value[index]
if byte == 0x5C:
index += 1 # Move past '\'
code_point, index = decode_escape_sequence(value, index)
unescaped.extend(utf8_encode(code_point))
else:
# Find invalid characters.
# Bytes that are less than 0x1f and not a continuation byte.
if (byte & 0x80) == 0:
# Single-byte code point
if byte <= 0x1F:
raise UnescapeError("invalid character")
unescaped.append(byte)
elif (byte & 0xE0) == 0xC0:
# Two-byte code point
if index + 1 > length:
raise UnescapeError("invalid code point")
unescaped.extend(value[index : index + 2])
index += 1
elif (byte & 0xF0) == 0xE0:
# Three-byte code point
if index + 2 > length:
raise UnescapeError("invalid code point")
unescaped.extend(value[index : index + 3])
index += 2
elif (byte & 0xF8) == 0xF0:
# Four-byte code point
if index + 3 > length:
raise UnescapeError("invalid code point")
unescaped.extend(value[index : index + 4])
index += 3
else:
raise UnescapeError("invalid character")
index += 1
return unescaped
test_cases = [
rb"abc",
b"a\xe2\x98\xbac",
rb'a\\"c',
rb"\u0061\u0062\u0063",
b"a\\\\c",
rb"a\u263Ac",
rb"a\\u263Ac",
rb"a\uD834\uDD1Ec",
rb"a\ud834\udd1ec",
rb"a\u263",
rb"a\u263Xc",
rb"a\uDD1Ec",
rb"a\uD834c",
rb"a\uD834\u263Ac",
]
if __name__ == "__main__":
for _bytes in test_cases:
try:
unescaped = unescape(_bytes)
print(f"{_bytes.hex(' '):<54} -> {unescaped.hex(' ')}")
except UnescapeError as err:
print(f"{_bytes.hex(' '):<54} -> {err}")
- Output:
61 62 63 -> 61 62 63 61 e2 98 ba 63 -> 61 e2 98 ba 63 61 5c 5c 22 63 -> 61 5c 22 63 5c 75 30 30 36 31 5c 75 30 30 36 32 5c 75 30 30 36 33 -> 61 62 63 61 5c 5c 63 -> 61 5c 63 61 5c 75 32 36 33 41 63 -> 61 e2 98 ba 63 61 5c 5c 75 32 36 33 41 63 -> 61 5c 75 32 36 33 41 63 61 5c 75 44 38 33 34 5c 75 44 44 31 45 63 -> 61 f0 9d 84 9e 63 61 5c 75 64 38 33 34 5c 75 64 64 31 65 63 -> 61 f0 9d 84 9e 63 61 5c 75 32 36 33 -> incomplete escape sequence 61 5c 75 32 36 33 58 63 -> invalid \uXXXX escape sequence 61 5c 75 44 44 31 45 63 -> unexpected low surrogate 61 5c 75 44 38 33 34 63 -> incomplete escape sequence at index 61 5c 75 44 38 33 34 5c 75 32 36 33 41 63 -> unexpected code point
Raku
# 20240816 Raku programming solution
grammar JSON-Unescape {
token TOP { ^ <value> $ }
token value { [ <str> | \\ <str=.str_escape> ]* }
token str { <-["\\\t\x[0A]]>+ }
token str_escape { <["\\/bfnrt]> | 'u' <utf16_codepoint>+ % '\u' }
token utf16_codepoint { <.xdigit>**4 }
}
class JSON-Unescape-Actions {
method TOP($/) { make $<value>.made }
method str($/) { make ~$/ }
method value($/) {
make +@$<str> == 1 ?? $<str>[0].made !! $<str>>>.made.join
}
method str_escape($/) {
make $<utf16_codepoint>.Bool
?? utf16.new( $<utf16_codepoint>.map({:16(~$_)}) ).decode()
!! %(< \\ / b n t f r " > Z=> < \\ / \b \n \t \f \r \" >).{~$/}
}
}
for < abc a☺c a\"c \u0061\u0062\u0063 a\\\c a\u263Ac a\\\u263Ac
a\uD834\uDD1Ec a\ud834\udd1ec a\\u263 a\\u263Xc
a\\uDD1Ec a\\uD834c a\\uD834\\u263Ac > -> $input {
my $o = JSON-Unescape.parse($input, actions => JSON-Unescape-Actions.new);
CATCH { default { say "Error: $_" } }
say $o.defined ?? $o.made !! "Error"
}
You may Attempt This Online!
- Output:
abc a☺c a"c abc a\c a☺c a\u263Ac a𝄞c a𝄞c Error Error Error: Malformed UTF-16; unexpected low surrogate Error: Malformed UTF-16; incomplete surrogate pair Error: Malformed UTF-16; incomplete surrogate pair
RPL
RPL is limited to an 8-bit ASCII character set, so we've tried here to preserve the spirit of the task. As the interpreter filters the backslash character (\
is ignored, \\
is turned into \
and \"
into "
) and is not directly accessible from the calculator keyboard, we use the pipe symbol |
instead.
« "" 9 CF @ flag 9 set signals an error 1 PICK3 SIZE FOR j OVER j DUP SUB IF DUP "|" == THEN DROP IF "'|bnfrta" PICK3 'j' INCR DUP SUB POS THEN { # 22h # 7Ch # 2Fh # 8h # Ch # Ah # Dh # 9h # 5Ch } LASTARG GET ELSE "#xxh" 2 4 PICK j 'j' INCR SUB REPL IFERR STR→ THEN 9 SF ELSE B→R CHR END END END IF 9 FC? THEN + ELSE DROP2 DUP "Error" →TAG OVER SIZE 'j' STO @ break on error END NEXT NIP » 'UNESC' STO @ ( "string" → "string" ) « { "abc" "a|'c" "|61|62|63" "a||c" "a|5Cc" "a||5Cc" "X|YZ" } 1 « UNESC » DOLIST DUP 1 DISP 7 FREEZE » 'TASK' STO
- Output:
{ "abc" "a"c" "abc" "a|c" "a\c" "a|5Cc" Error: "X|YZ" }
TypeScript
/* An exception thrown when a string contains an invalid escape sequence. */
class UnescapeError extends Error {
constructor(readonly message: string, readonly index: number) {
super(message);
Object.setPrototypeOf(this, new.target.prototype);
this.name = "UnescapeError";
this.index = index;
this.message = `${message}, at index ${index}`;
}
}
/* Unescape a JSON-like string value. */
function unescapeString(value: string): string {
const rv: string[] = [];
const length = value.length;
let index = 0;
let startIndex = 0;
let codepoint: number;
while (index < length) {
const ch = value[index];
if (ch === "\\") {
index += 1; // Move past '\'
switch (value[index]) {
case '"':
rv.push('"');
break;
case "\\":
rv.push("\\");
break;
case "/":
rv.push("/");
break;
case "b":
rv.push("\x08");
break;
case "f":
rv.push("\x0C");
break;
case "n":
rv.push("\n");
break;
case "r":
rv.push("\r");
break;
case "t":
rv.push("\t");
break;
case "u":
startIndex = index - 1;
[codepoint, index] = decodeHexChar(value, index);
rv.push(stringFromCodePoint(codepoint, startIndex));
break;
default:
throw new UnescapeError(`unknown escape sequence`, index - 1);
}
} else {
stringFromCodePoint(ch.codePointAt(0), index);
rv.push(ch);
}
index += 1;
}
return rv.join("");
}
/* Decode a `\uXXXX` or `\uXXXX\uXXXX` escape sequence from _value_ at _index_. */
function decodeHexChar(value: string, index: number): [number, number] {
const length = value.length;
if (index + 4 >= length) {
throw new UnescapeError(`incomplete escape sequence`, index - 1);
}
index += 1; // Move past 'u'
let codepoint = parseHexDigits(value.slice(index, index + 4), index - 2);
if (isLowSurrogate(codepoint)) {
throw new UnescapeError(`unexpected low surrogate code point`, index - 2);
}
if (isHighSurrogate(codepoint)) {
// Expect a surrogate pair.
if (
!(
index + 9 < length &&
value[index + 4] === "\\" &&
value[index + 5] === "u"
)
) {
throw new UnescapeError(`incomplete escape sequence`, index - 2);
}
const lowSurrogate = parseHexDigits(
value.slice(index + 6, index + 10),
index + 4
);
if (!isLowSurrogate(lowSurrogate)) {
throw new UnescapeError(`unexpected code point`, index + 4);
}
codepoint =
0x10000 + (((codepoint & 0x03ff) << 10) | (lowSurrogate & 0x03ff));
return [codepoint, index + 9];
}
return [codepoint, index + 3];
}
/* Parse a hexadecimal string as an integer. */
function parseHexDigits(digits: string, index: number): number {
const encoder = new TextEncoder();
let codepoint = 0;
for (const digit of encoder.encode(digits)) {
codepoint <<= 4;
switch (digit) {
case 48:
case 49:
case 50:
case 51:
case 52:
case 53:
case 54:
case 55:
case 56:
case 57:
codepoint |= digit - 48; // '0'
break;
case 65:
case 66:
case 67:
case 68:
case 69:
case 70:
codepoint |= digit - 65 + 10; // 'A'
break;
case 97:
case 98:
case 99:
case 100:
case 101:
case 102:
codepoint |= digit - 97 + 10; // 'a'
break;
default:
throw new UnescapeError("invalid \\uXXXX escape sequence", index);
}
}
return codepoint;
}
/* Check the codepoint is valid and return its string representation. */
function stringFromCodePoint(codepoint: number | undefined, index): string {
if (codepoint === undefined || codepoint <= 0x1f) {
throw new UnescapeError("invalid character", index);
}
try {
return String.fromCodePoint(codepoint);
} catch {
throw new UnescapeError("invalid escape sequence", index);
}
}
export function isHighSurrogate(codepoint: number): boolean {
return codepoint >= 0xd800 && codepoint <= 0xdbff;
}
export function isLowSurrogate(codepoint: number): boolean {
return codepoint >= 0xdc00 && codepoint <= 0xdfff;
}
const testCases = [
"abc",
"a☺c",
'a\\"c',
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
];
for (const s of testCases) {
try {
console.log(`${s} -> ${unescapeString(s)}`);
} catch (err) {
if (err instanceof UnescapeError) {
console.log(`${s} -> ${err.message}`);
} else {
throw err;
}
}
}
- Output:
abc -> abc a☺c -> a☺c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a☺c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a𝄞c a\ud834\udd1ec -> a𝄞c a\u263 -> incomplete escape sequence, at index 1 a\u263Xc -> invalid \uXXXX escape sequence, at index 1 a\uDD1Ec -> unexpected low surrogate code point, at index 1 a\uD834c -> incomplete escape sequence, at index 1 a\uD834\u263Ac -> unexpected code point, at index 7
Wren
More or less.
class UnescapeError {
construct new (text, index) {
_message = text + ", at index %(index)"
}
toString { _message }
}
var isHighSurrogate = Fn.new { |cp| cp >= 0xd800 && cp <= 0xdbff }
var isLowSurrogate = Fn.new { |cp| cp >= 0xdc00 && cp <= 0xdfff }
/* Check the codepoint is valid and return its string representation. */
var stringFromCodePoint = Fn.new { |cp, index|
if (cp > 0x10ffff || cp <= 0x1f) {
return UnescapeError.new("invalid character", index)
}
return String.fromCodePoint(cp)
}
/* Parse a list of hexadecimal digits as an integer. */
var parseHexDigits = Fn.new { |digits, index|
var cp = 0
for (digit in digits.join("").bytes) {
cp = cp << 4
if (digit >= 48 && digit <= 57) {
cp = cp | (digit - 48) // '0'
} else if (digit >= 65 && digit <= 70) {
cp = cp | (digit - 65 + 10) // 'A'
} else if (digit >= 97 && digit <= 102) {
cp = cp | (digit - 97 + 10) // 'a'
} else {
return UnescapeError.new("invalid \\uXXXX escape sequence", index)
}
}
return cp
}
/* Decode a `\uXXXX` or `\uXXXX\uXXXX` escape sequence from character at index. */
var decodeHexChar = Fn.new { |chars, index|
var length = chars.count
if (index + 4 >= length) {
return UnescapeError.new("incomplete escape sequence", index - 1)
}
index = index + 1 // Move past 'u'
//var chars = value.toList
var cp = parseHexDigits.call(chars[index...index+4], index - 2)
if (cp is UnescapeError) return cp
if (isLowSurrogate.call(cp)) {
return UnescapeError.new("unexpected low surrogate code point", index - 2)
}
if (isHighSurrogate.call(cp)) {
// Expect a surrogate pair
if (!(index + 9 < length && chars[index + 4] == "\\" && chars[index + 5] == "u")) {
return UnescapeError.new("incomplete escape sequence", index - 2)
}
var lowSurrogate = parseHexDigits.call(chars[index+6...index+10], index + 4)
if (lowSurrogate is UnescapeError) return lowSurrogate
if (!isLowSurrogate.call(lowSurrogate)) {
return UnescapeError.new("unexpected code point", index + 4)
}
cp = 0x10000 + (((cp & 0x03ff) << 10) | (lowSurrogate & 0x03ff))
return [cp, index + 9]
}
return [cp, index + 3]
}
/* Unescape a JSON-like string value. */
var unescapeString = Fn.new { |value|
var rv = []
var index = 0
var startIndex = 0
var chars = value.toList
var length = chars.count
while (index < length) {
var ch = chars[index]
if (ch == "\\") {
index = index + 1 // Move past '\'
ch = chars[index]
if (ch == "\"") {
rv.add("\"")
} else if (ch == "\\") {
rv.add("\\")
} else if (ch == "/") {
rv.add("/")
} else if (ch == "b") {
rv.add("\b")
} else if (ch == "f") {
rv.add("\f")
} else if (ch == "n") {
rv.add("\n")
} else if (ch == "r") {
rv.add("\r")
} else if (ch == "t") {
rv.add("\t")
} else if (ch == "u") {
startIndex = index - 1
var res = decodeHexChar.call(chars, index)
if (res is UnescapeError) return res
var cp = res[0]
index = res[1]
var s = stringFromCodePoint.call(cp, startIndex)
if (s is UnescapeError) return s
rv.add(s)
} else {
return UnescapeError.new("unknown escape sequence", index)
}
} else {
var s = stringFromCodePoint.call(ch.codePoints[0], index)
if (s is UnescapeError) return s
rv.add(ch)
}
index = index + 1
}
return rv.join("")
}
var testCases = [
"abc",
"a☺c",
"a\\\"c",
"\\u0061\\u0062\\u0063",
"a\\\\c",
"a\\u263Ac",
"a\\\\u263Ac",
"a\\uD834\\uDD1Ec",
"a\\ud834\\udd1ec",
"a\\u263",
"a\\u263Xc",
"a\\uDD1Ec",
"a\\uD834c",
"a\\uD834\\u263Ac",
]
for (s in testCases) {
var us = unescapeString.call(s)
System.print("%(s) -> %(us)")
}
- Output:
abc -> abc a☺c -> a☺c a\"c -> a"c \u0061\u0062\u0063 -> abc a\\c -> a\c a\u263Ac -> a☺c a\\u263Ac -> a\u263Ac a\uD834\uDD1Ec -> a𝄞c a\ud834\udd1ec -> a𝄞c a\u263 -> incomplete escape sequence, at index 1 a\u263Xc -> invalid \uXXXX escape sequence, at index 1 a\uDD1Ec -> unexpected low surrogate code point, at index 1 a\uD834c -> incomplete escape sequence, at index 1 a\uD834\u263Ac -> unexpected code point, at index 7