Character codes: Difference between revisions
Content added Content deleted
Line 882: | Line 882: | ||
[[Category:Scala Implementations]] |
[[Category:Scala Implementations]] |
||
{{libheader|Scala}} |
{{libheader|Scala}} |
||
Scala supports unicode characters, but each character is UTF-16, so |
Scala supports Unicode characters, but each Char is a single 16-bit UTF-16 code unit, so there is not a 1-to-1 relationship between Chars and characters for the supplementary character sets. |
||
===In a REPL session=== |
|||
there is not a 1-to-1 relationship for supplementary character sets. |
|||
Without worrying about supplemental character sets: |
|||
<lang scala>scala> 'a' toInt |
<lang scala>scala> 'a' toInt |
||
res2: Int = 97 |
res2: Int = 97 |
||
Line 897: | Line 895: | ||
scala> "\uD869\uDEA5" |
scala> "\uD869\uDEA5" |
||
res5: String = 𪚥</lang> |
res5: String = 𪚥</lang> |
||
===Full swing workout=== |
|||
Taking the supplementary character sets into account:
|||
<lang scala>import java.lang.Character._; import scala.annotation.tailrec |
|||
object CharacterCode extends App { |
|||
/** Converts a Unicode code point into its UTF-16 representation.
  *
  * Delegates to `java.lang.Character.toChars`.
  *
  * @param n the code point to convert
  * @return the UTF-16 code units produced by `Character.toChars` for `n`
  */
def intToChars(n: Int): Array[Char] = {
  val utf16Units = Character.toChars(n)
  utf16Units
}
|||
/** Splits a string into a list with one element per Unicode code point.
  *
  * A well-formed surrogate pair (high surrogate followed by low surrogate)
  * becomes one two-char element; every other char becomes its own one-char
  * element. Unpaired surrogates are kept as single-char elements, so no
  * input char is ever dropped or merged with an unrelated neighbour.
  *
  * @param UTFstring the text to split
  * @return the code points of `UTFstring`, in order, each as a `String`
  */
def UnicodeToList(UTFstring: String): List[String] = {
  @tailrec
  def inner(index: Int, acc: List[String]): List[String] =
    if (index >= UTFstring.length) acc.reverse
    else {
      // codePointAt only combines a high surrogate with a following low
      // surrogate; charCount then tells us whether one or two chars were used.
      val width = Character.charCount(UTFstring.codePointAt(index))
      // Prepend and reverse once at the end: appending to a List is O(n).
      inner(index + width, UTFstring.substring(index, index + width) :: acc)
    }

  inner(0, Nil)
}
|||
/** Returns the numeric value of the code point at the start of a string.
  *
  * When the first two chars form a surrogate pair the combined supplementary
  * code point is returned; otherwise the value of the first char alone.
  * Chars after the first code point are ignored.
  *
  * @param utf a non-empty string; position 0 is read unconditionally, so an
  *            empty string fails with an index error
  * @return the code point beginning at index 0
  */
def UnicodeToInt(utf: String): Int = utf.codePointAt(0)
|||
/** Renders a string as a double-quoted C/C++/Java style source literal in
  * which every UTF-16 code unit is written as a backslash-u escape followed
  * by four upper-case hexadecimal digits.
  *
  * @param utf the text to render
  * @return the quoted, escaped form of `utf`
  */
def UTFtoHexString(utf: String): String = {
  val hexUnits = utf.iterator.map(unit => "%04X".format(unit.toInt))
  hexUnits.mkString("\"\\u", "\\u", "\"")
}
|||
/** Tests Unicode character properties of a single-character string.
  *
  * Produces two letters: the first is "Y" when the whole string matches the
  * regex class for general category M (any mark), the second is "Y" when it
  * matches category Mn (non-spacing mark); otherwise the letter is "N".
  *
  * @param ch the string to classify
  * @return a two-letter string made of "Y" and "N"
  */
def flags(ch: String): String = {
  def yesNo(categoryPattern: String): String =
    if (ch.matches(categoryPattern)) "Y" else "N"

  yesNo("\\p{M}") + yesNo("\\p{Mn}")
}
|||
val str = '\uFEFF' /*big-endian BOM*/ + "\u0301a" + |
|||
"$áabcde¢£¤¥©ÇßIJijŁłʒλπक्तु•₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵℃←→⇒∙⌘☃☹☺☻ア字文𠀀" + intToChars(173733).mkString |
|||
println(s"Example string: $str") |
|||
Worrying about supplemental character sets, we need to test the "next" character as well: |
|||
println(""" | Chr C/C++/Java source Code Point Hex Dec Mn Name |
|||
<lang scala>def charToInt(c: Char, next: Char): Option[Int] = (c, next) match { |
|||
!----+ --- ------------------------- ------- -------- -- """.stripMargin('!') + "-" * 27) |
|||
case _ if (c.isHighSurrogate && next.isLowSurrogate) => Some(java.lang.Character.toCodePoint(c, next)) |
|||
case _ if (c.isLowSurrogate) => None |
|||
case _ => Some(c.toInt) |
|||
} |
|||
(UnicodeToList(str)).zipWithIndex.map { |
|||
def intToChars(n: Int): Array[Char] = java.lang.Character.toChars(n)</lang> |
|||
case (coll, nr) => |
|||
[http://illegalargumentexception.blogspot.nl/2009/05/java-rough-guide-to-character-encoding.html More background info] |
|||
f"$nr%4d: $coll\t${UTFtoHexString(coll)}%27s U+${UnicodeToInt(coll)}%05X" + |
|||
f"${"(" + UnicodeToInt(coll).toString}%8s) ${flags(coll)} ${getName(coll(0).toInt)} " |
|||
}.foreach(println) |
|||
}</lang> |
|||
{{Output}} |
|||
<pre style="height:20ex;overflow:scroll">Example string: ́a$áabcde¢£¤¥©ÇßIJijŁłʒλπक्तु•₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵℃←→⇒∙⌘☃☹☺☻ア字文𠀀𪚥 |
|||
| Chr C/C++/Java source Code Point Hex Dec Mn Name |
|||
----+ --- ------------------------- ------- -------- -- --------------------------- |
|||
0: "\uFEFF" U+0FEFF (65279) NN ZERO WIDTH NO-BREAK SPACE |
|||
1: ́ "\u0301" U+00301 (769) YY COMBINING ACUTE ACCENT |
|||
2: a "\u0061" U+00061 (97) NN LATIN SMALL LETTER A |
|||
3: $ "\u0024" U+00024 (36) NN DOLLAR SIGN |
|||
4: á "\u00E1" U+000E1 (225) NN LATIN SMALL LETTER A WITH ACUTE |
|||
5: a "\u0061" U+00061 (97) NN LATIN SMALL LETTER A |
|||
6: b "\u0062" U+00062 (98) NN LATIN SMALL LETTER B |
|||
7: c "\u0063" U+00063 (99) NN LATIN SMALL LETTER C |
|||
8: d "\u0064" U+00064 (100) NN LATIN SMALL LETTER D |
|||
9: e "\u0065" U+00065 (101) NN LATIN SMALL LETTER E |
|||
10: ¢ "\u00A2" U+000A2 (162) NN CENT SIGN |
|||
11: £ "\u00A3" U+000A3 (163) NN POUND SIGN |
|||
12: ¤ "\u00A4" U+000A4 (164) NN CURRENCY SIGN |
|||
13: ¥ "\u00A5" U+000A5 (165) NN YEN SIGN |
|||
14: © "\u00A9" U+000A9 (169) NN COPYRIGHT SIGN |
|||
15: Ç "\u00C7" U+000C7 (199) NN LATIN CAPITAL LETTER C WITH CEDILLA |
|||
16: ß "\u00DF" U+000DF (223) NN LATIN SMALL LETTER SHARP S |
|||
17: IJ "\u0132" U+00132 (306) NN LATIN CAPITAL LIGATURE IJ |
|||
18: ij "\u0133" U+00133 (307) NN LATIN SMALL LIGATURE IJ |
|||
19: Ł "\u0141" U+00141 (321) NN LATIN CAPITAL LETTER L WITH STROKE |
|||
20: ł "\u0142" U+00142 (322) NN LATIN SMALL LETTER L WITH STROKE |
|||
21: ʒ "\u0292" U+00292 (658) NN LATIN SMALL LETTER EZH |
|||
22: λ "\u03BB" U+003BB (955) NN GREEK SMALL LETTER LAMDA |
|||
23: π "\u03C0" U+003C0 (960) NN GREEK SMALL LETTER PI |
|||
24: क "\u0915" U+00915 (2325) NN DEVANAGARI LETTER KA |
|||
25: ् "\u094D" U+0094D (2381) YY DEVANAGARI SIGN VIRAMA |
|||
26: त "\u0924" U+00924 (2340) NN DEVANAGARI LETTER TA |
|||
27: ु "\u0941" U+00941 (2369) YY DEVANAGARI VOWEL SIGN U |
|||
28: • "\u2022" U+02022 (8226) NN BULLET |
|||
29: ₠ "\u20A0" U+020A0 (8352) NN EURO-CURRENCY SIGN |
|||
30: ₡ "\u20A1" U+020A1 (8353) NN COLON SIGN |
|||
31: ₢ "\u20A2" U+020A2 (8354) NN CRUZEIRO SIGN |
|||
32: ₣ "\u20A3" U+020A3 (8355) NN FRENCH FRANC SIGN |
|||
33: ₤ "\u20A4" U+020A4 (8356) NN LIRA SIGN |
|||
34: ₥ "\u20A5" U+020A5 (8357) NN MILL SIGN |
|||
35: ₦ "\u20A6" U+020A6 (8358) NN NAIRA SIGN |
|||
36: ₧ "\u20A7" U+020A7 (8359) NN PESETA SIGN |
|||
37: ₨ "\u20A8" U+020A8 (8360) NN RUPEE SIGN |
|||
38: ₩ "\u20A9" U+020A9 (8361) NN WON SIGN |
|||
39: ₪ "\u20AA" U+020AA (8362) NN NEW SHEQEL SIGN |
|||
40: ₫ "\u20AB" U+020AB (8363) NN DONG SIGN |
|||
41: € "\u20AC" U+020AC (8364) NN EURO SIGN |
|||
42: ₭ "\u20AD" U+020AD (8365) NN KIP SIGN |
|||
43: ₮ "\u20AE" U+020AE (8366) NN TUGRIK SIGN |
|||
44: ₯ "\u20AF" U+020AF (8367) NN DRACHMA SIGN |
|||
45: ₰ "\u20B0" U+020B0 (8368) NN GERMAN PENNY SIGN |
|||
46: ₱ "\u20B1" U+020B1 (8369) NN PESO SIGN |
|||
47: ₲ "\u20B2" U+020B2 (8370) NN GUARANI SIGN |
|||
48: ₳ "\u20B3" U+020B3 (8371) NN AUSTRAL SIGN |
|||
49: ₴ "\u20B4" U+020B4 (8372) NN HRYVNIA SIGN |
|||
50: ₵ "\u20B5" U+020B5 (8373) NN CEDI SIGN |
|||
51: ℃ "\u2103" U+02103 (8451) NN DEGREE CELSIUS |
|||
52: ← "\u2190" U+02190 (8592) NN LEFTWARDS ARROW |
|||
53: → "\u2192" U+02192 (8594) NN RIGHTWARDS ARROW |
|||
54: ⇒ "\u21D2" U+021D2 (8658) NN RIGHTWARDS DOUBLE ARROW |
|||
55: ∙ "\u2219" U+02219 (8729) NN BULLET OPERATOR |
|||
56: ⌘ "\u2318" U+02318 (8984) NN PLACE OF INTEREST SIGN |
|||
57: ☃ "\u2603" U+02603 (9731) NN SNOWMAN |
|||
58: ☹ "\u2639" U+02639 (9785) NN WHITE FROWNING FACE |
|||
59: ☺ "\u263A" U+0263A (9786) NN WHITE SMILING FACE |
|||
60: ☻ "\u263B" U+0263B (9787) NN BLACK SMILING FACE |
|||
61: ア "\u30A2" U+030A2 (12450) NN KATAKANA LETTER A |
|||
62: 字 "\u5B57" U+05B57 (23383) NN CJK UNIFIED IDEOGRAPHS 5B57 |
|||
63: 文 "\u6587" U+06587 (25991) NN CJK UNIFIED IDEOGRAPHS 6587 |
|||
64: "\uF8FF" U+0F8FF (63743) NN PRIVATE USE AREA F8FF |
|||
65: 𠀀 "\uD840\uDC00" U+20000 (131072) NN HIGH SURROGATES D840 |
|||
66: 𪚥 "\uD869\uDEA5" U+2A6A5 (173733) NN HIGH SURROGATES D869</pre>[http://illegalargumentexception.blogspot.nl/2009/05/java-rough-guide-to-character-encoding.html More background info: "Java: a rough guide to character encoding"] |
|||
=={{header|Scheme}}== |
=={{header|Scheme}}== |