note description: "[ Converter from/to UTF-8, UTF-16 and UTF-32 encodings. Handling of invalid encodings ============================= Whenever a UTF-8 or UTF-16 sequence is decoded, the decoding routines also check that the sequence is valid. If it is not, it will replace the invalid unit (e.g. a byte for UTF-8 and a 2-byte for UTF-16) by the replacement character U+FFFD as described by variant #3 of the recommended practice for replacement character in Unicode (see Unicode Public Review Issue #121 for more details). However it means that you cannot roundtrip an incorrectly encoded sequence back and forth between the encoded version and the decoded STRING_32 version. To allow roundtrip, an escaped representation of a bad encoded sequence has been introduced. It adds a fourth variant (which is a slight modification of variant #3) to the recommended practice where the replacement character is followed by the printed hexadecimal value of the invalid byte or the invalid 2-byte sequence. To provide an example (assuming that the Unicode character U+FFFD is represented as ? textually): 1 - on UNIX, any invalid UTF-8 byte sequence such as 0x8F 0x8F is encoded as the following Unicode sequence: U+FFFD U+0038 U+0046 U+FFFD U+0038 U+0046, and textually it looks like "?8F?8F". 2 - on Windows, any invalid UTF-16 2-byte sequence such as 0xD800 0x0054 is encoded as the following Unicode sequence: U+FFFD U+0075 U+0044 U+0038 U+0030 U+0030 U+FFFD U+0035 U+0034, and textually it looks like "?uD800?54". The rule is that if the 2-byte sequence does not fit into 1 byte, it uses the letter `u' followed by the hexadecimal value of the 2-byte sequence, otherwise it simply uses the 1-byte hexadecimal representation. ]" date: "$Date$" revision: "$Revision$" expanded class UTF_CONVERTER feature -- Access escape_character: CHARACTER_32 = '%/0xFFFD/' -- Unicode replacement character to escape invalid UTF-8 or UTF-16 encoding. 
-- UTF-8 encoding: 0xEF 0xBF 0xBD
			-- Binary UTF-8 encoding: 11101111 10111111 10111101
			-- UTF-16 encoding: 0xFFFD

feature -- Status report

	is_valid_utf_8_string_8 (s: READABLE_STRING_8): BOOLEAN
			-- Is `s' a valid UTF-8 Unicode sequence?
			-- Only the form of the lead byte and the `10xxxxxx' shape of the
			-- trailing bytes are checked; a multi-byte form that is cut short
			-- by the end of `s' is reported as invalid.
		local
			c: NATURAL_32
			i, nb: INTEGER
		do
			from
				nb := s.count
				Result := True
			until
				i >= nb or not Result
			loop
				i := i + 1
				c := s.code (i)
				if c <= 127 then
						-- Form 0xxxxxxx.
				elseif (c & 0xE0) = 0xC0 and i < nb then
						-- Form 110xxxxx 10xxxxxx.
					i := i + 1
					Result := (s.code (i) & 0xC0) = 0x80
				elseif (c & 0xF0) = 0xE0 and i + 1 < nb then
						-- Form 1110xxxx 10xxxxxx 10xxxxxx.
					i := i + 2
					Result := (s.code (i - 1) & 0xC0) = 0x80 and (s.code (i) & 0xC0) = 0x80
				elseif (c & 0xF8) = 0xF0 and i + 2 < nb then
						-- Form 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
					i := i + 3
					Result := (s.code (i - 2) & 0xC0) = 0x80 and (s.code (i - 1) & 0xC0) = 0x80 and (s.code (i) & 0xC0) = 0x80
				else
						-- Anything else is not a valid UTF-8 sequence that would yield a valid Unicode character.
					Result := False
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16le_string_8 (s: READABLE_STRING_8): BOOLEAN
			-- Is `s' a valid UTF-16LE Unicode sequence?
			-- Each pair of characters of `s' is read as one 16-bit code unit,
			-- low byte first.
		local
			c1, c2: NATURAL_32
			i, nb: INTEGER
		do
			nb := s.count
				-- If `nb' is not even, then clearly not a valid UTF-16 string.
			if (nb \\ 2) = 0 then
				from
					Result := True
				until
					i >= nb or not Result
				loop
					i := i + 2
					c1 := s.code (i - 1) | (s.code (i) |<< 8)
					if c1 < 0xD800 or c1 >= 0xE000 then
							-- Codepoint from Basic Multilingual Plane: one 16-bit code unit, this is valid Unicode.
					elseif c1 <= 0xDBFF then
						i := i + 2
						if i <= nb then
							c2 := s.code (i - 1) | (s.code (i) |<< 8)
								-- Trail surrogates range over 0xDC00 .. 0xDFFF.
							Result := 0xDC00 <= c2 and c2 <= 0xDFFF
						else
								-- Surrogate pair is incomplete, clearly not a valid UTF-16 sequence.
							Result := False
						end
					else
							-- Invalid starting surrogate pair which should be between 0xD800 and 0xDBFF.
						Result := False
					end
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16_subpointer (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN): BOOLEAN
			-- Is `p' a valid UTF-16 Unicode sequence between code unit `start_pos' and `end_pos'?
			-- If `a_stop_at_null' we stop checking after finding a null character.
			-- `start_pos' and `end_pos' are expressed in 16-bit code units; they are
			-- converted to byte offsets before reading from `p'.
			-- False when the bounds do not fit in `p'.
		local
			i, n: INTEGER
			c1, c2: NATURAL_32
		do
			if p.count >= 2 and start_pos >= 0 and start_pos <= end_pos + 1 and end_pos < (p.count // 2) then
				from
					i := start_pos * 2
					n := end_pos * 2
					Result := True
				until
					i > n or not Result
				loop
					c1 := p.read_natural_16 (i)
					if c1 = 0 and a_stop_at_null then
							-- We hit our null terminating character, we can stop
						i := n + 1
					else
						if c1 < 0xD800 or c1 >= 0xE000 then
								-- Codepoint from Basic Multilingual Plane: one 16-bit code unit, this is valid Unicode.
								-- `i' is a byte offset, so one code unit is two bytes.
							i := i + 2
						elseif c1 <= 0xDBFF then
							i := i + 2
							if i <= n then
								c2 := p.read_natural_16 (i)
									-- Trail surrogates range over 0xDC00 .. 0xDFFF.
								Result := 0xDC00 <= c2 and c2 <= 0xDFFF
									-- Step over the trail surrogate so that it is not
									-- examined again as the start of a new character.
								i := i + 2
							else
									-- Surrogate pair is incomplete, clearly not a valid UTF-16 sequence.
								Result := False
							end
						else
								-- Invalid starting surrogate pair which should be between 0xD800 and 0xDBFF.
							Result := False
						end
					end
				end
			end
		ensure
			instance_free: class
		end

	is_valid_utf_16 (s: SPECIAL [NATURAL_16]): BOOLEAN
			-- Is `s' a valid UTF-16 Unicode sequence?
		local
			i, n: INTEGER
			c: NATURAL_16
		do
			from
				i := 0
				n := s.count
				Result := True
			until
				i >= n or not Result
			loop
				c := s.item (i)
				if c < 0xD800 or c >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit, this is valid Unicode.
				elseif c <= 0xDBFF then
					i := i + 1
					if i < n then
						c := s.item (i)
							-- Trail surrogates range over 0xDC00 .. 0xDFFF.
						Result := 0xDC00 <= c and c <= 0xDFFF
					else
							-- Surrogate pair is incomplete, clearly not a valid UTF-16 sequence.
						Result := False
					end
				else
						-- Invalid starting surrogate pair which should be between 0xD800 and 0xDBFF.
					Result := False
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

feature -- Measurement

	utf_8_bytes_count (s: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER): INTEGER
			-- Number of bytes necessary to encode in UTF-8 `s.substring (start_pos, end_pos)'.
			-- Note that this feature can be used for both escaped and non-escaped string.
			-- In the case of escaped strings, the result will be possibly higher than really needed.
			-- It does not include the terminating null character.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
		local
			i: INTEGER
			c: NATURAL_32
		do
			from
				i := start_pos
			until
				i > end_pos
			loop
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx.
					Result := Result + 1
				elseif c <= 0x7FF then
						-- 110xxxxx 10xxxxxx
					Result := Result + 2
				elseif c <= 0xFFFF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					Result := Result + 3
				else
						-- c <= 1FFFFF - there are no higher code points
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					Result := Result + 4
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

	utf_16_characters_count_form_pointer (m: MANAGED_POINTER; start_pos, end_pos: INTEGER): INTEGER
			-- Number of characters of the UTF-16 encoded `m' starting at `start_pos' in `m' up to `end_pos - 1'.
			-- It does not include the terminating null character.
			-- `start_pos' and `end_pos' are byte offsets (both required to be even).
		require
			start_position_big_enough: start_pos >= 0
			end_position: start_pos <= end_pos + 2
			end_pos_small_enought: end_pos < m.count
			even_start_position: start_pos \\ 2 = 0
			even_end_position: end_pos \\ 2 = 0
		local
			i, n: INTEGER
			c: NATURAL_32
		do
			from
				i := start_pos
				n := end_pos
			until
				i >= end_pos
			loop
				c := m.read_natural_16 (i)
				if c < 0xD800 or c >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
					i := i + 2
				elseif i <= n then
						-- Supplementary Planes: surrogate pair with lead and trail surrogates.
						-- Inside the loop `i < end_pos' holds, so this branch is taken for
						-- every code unit in the surrogate range.
					i := i + 4
				end
				Result := Result + 1
			end
		ensure
			instance_free: class
		end

	utf_16_bytes_count (s: READABLE_STRING_GENERAL; start_pos, end_pos: INTEGER): INTEGER
			-- Number of bytes necessary at the very least to encode in UTF-16 `s.substring (start_pos, end_pos)'.
			-- Note that this feature can be used for both escaped and non-escaped string.
			-- In the case of escaped strings, the result will be possibly higher than really needed.
			-- It does not include the terminating null character.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
		local
			i: INTEGER
			c: NATURAL_32
		do
			from
				i := start_pos
			until
				i > end_pos
			loop
				c := s.code (i)
				if c <= 0xFFFF then
						-- Code point from Basic Multilingual Plane: one 16-bit code unit.
					Result := Result + 2
				else
						-- Supplementary Planes: surrogate pair, two 16-bit code units.
					Result := Result + 4
				end
				i := i + 1
			end
		ensure
			instance_free: class
		end

	utf_8_to_string_32_count (s: SPECIAL [CHARACTER]; start_pos, end_pos: INTEGER): INTEGER
			-- Count of characters corresponding to UTF-8 sequence `s'
			-- between indexes `start_pos' and `end_pos' (both included).
			-- A multi-byte form is counted only when all its bytes fit before
			-- or at `end_pos'; bytes that cannot start a sequence (above 0xF7)
			-- are skipped without being counted.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < s.count
		local
			i: INTEGER
			n: INTEGER
			c: INTEGER
		do
			from
				i := start_pos
				n := end_pos
			until
				i > n
			loop
				c := s [i].code
				if c <= 0x7F then
						-- 0xxxxxxx
					i := i + 1
					Result := Result + 1
				elseif c <= 0xDF then
						-- 110xxxxx 10xxxxxx
					i := i + 2
						-- `i' is now one past the last byte of the sequence, so the
						-- sequence is complete when `i - 1 <= n'.
					if i <= n + 1 then
						Result := Result + 1
					end
				elseif c <= 0xEF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					i := i + 3
					if i <= n + 1 then
						Result := Result + 1
					end
				elseif c <= 0xF7 then
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					i := i + 4
					if i <= n + 1 then
						Result := Result + 1
					end
				else
						-- Not the start of any UTF-8 form: skip it so that the loop
						-- always makes progress.
					i := i + 1
				end
			end
		ensure
			instance_free: class
		end

feature -- UTF-32 to UTF-8

	string_32_to_utf_8_string_8 (s: READABLE_STRING_32): STRING_8
			-- UTF-8 sequence corresponding to `s'.
do
			Result := utf_32_string_to_utf_8_string_8 (s)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (Result).same_string (s)
		end

	string_32_into_utf_8_string_8 (s: READABLE_STRING_32; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s' appended into `a_result'.
		do
			utf_32_string_into_utf_8_string_8 (s, a_result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

	utf_32_string_to_utf_8_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-8 sequence corresponding to `s' interpreted as a UTF-32 sequence.
		do
			create Result.make (s.count)
			utf_32_string_into_utf_8_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (Result).same_string_general (s)
		end

	utf_32_string_into_utf_8_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s' interpreted as a UTF-32 sequence
			-- appended into `a_result'.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
		do
			from
				n := s.count
					-- Reserve room for the one-byte-per-character case up front.
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				utf_32_code_into_utf_8_string_8 (s.code (i), a_result)
			end
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	utf_32_code_into_utf_8_string_8 (c: NATURAL_32; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to code `c' appended into `a_result'.
		do
			if c <= 0x7F then
					-- 0xxxxxxx
				a_result.extend (c.to_character_8)
			elseif c <= 0x7FF then
					-- 110xxxxx 10xxxxxx
				a_result.extend (((c |>> 6) | 0xC0).to_character_8)
				a_result.extend (((c & 0x3F) | 0x80).to_character_8)
			elseif c <= 0xFFFF then
					-- 1110xxxx 10xxxxxx 10xxxxxx
				a_result.extend (((c |>> 12) | 0xE0).to_character_8)
				a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
				a_result.extend (((c & 0x3F) | 0x80).to_character_8)
			else
					-- c <= 1FFFFF - there are no higher code points
					-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				a_result.extend (((c |>> 18) | 0xF0).to_character_8)
				a_result.extend ((((c |>> 12) & 0x3F) | 0x80).to_character_8)
				a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
				a_result.extend (((c & 0x3F) | 0x80).to_character_8)
			end
		ensure
			instance_free: class
		end

	escaped_utf_32_substring_into_utf_8_0_pointer (
			s: READABLE_STRING_GENERAL;
			start_pos, end_pos: INTEGER;
			p: MANAGED_POINTER;
			p_offset: INTEGER;
			a_new_upper: detachable CELL [INTEGER]
		)
			-- Write UTF-8 sequence corresponding to `s', interpreted as a UTF-32 sequence that could
			-- be escaped, with terminating zero to address `p + p_offset' and update the size of `p' to the
			-- number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
			-- If `s' contains the `escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			p_offset_non_negative: p_offset >= 0
		local
			i, n, m, l_count: INTEGER
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded, l_resized: BOOLEAN
		do
				-- Basic assumptions that there will be only one-byte code units.
			n := end_pos - start_pos + 1
			l_count := p.count
				-- Check that there is at least `n' bytes available plus the terminating null character.
			if l_count - p_offset < (n + 1) then
					-- Optimize resizing, once we have to resize, we actually perform the resizing
					-- only once.
				l_count := p_offset + utf_8_bytes_count (s, start_pos, end_pos) + 1
				p.resize (l_count)
				l_resized := True
			end
			from
				m := p_offset
				i := start_pos - 1
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c = escape_character.natural_32_code then
						-- We might be facing a character that was escaped.
						-- In the Unix case, we only accept the 1-byte encoded format.
						-- `i' is an index in `s', so the look-ahead is bounded by `end_pos',
						-- not by the length `n' of the substring.
					if i < end_pos and then s.item (i + 1) = escape_character then
							-- The `escape_character' was escaped, it meant they really wanted an `escape_character'.
						i := i + 1
					elseif i + 1 < end_pos then
							-- We have at least 2 characters to read, make sure they represent an hexadecimal
							-- value.
						l_encoded_value := s.substring (i + 1, i + 2)
						if is_hexa_decimal (l_encoded_value) then
							c := to_natural_32 (l_encoded_value)
							if c <= 0x7F then
									-- Value was encoded when it should not have been
									-- do nothing, we leave the original content as is.
								c := escape_character.natural_32_code
							else
								l_decoded := True
								i := i + 2
							end
						else
								-- Not an hexadecimal value, it was not escaped.
						end
					else
							-- Not enough to read to make it valid, it was not escaped.
					end
				end
				if not l_decoded then
					if c <= 0x7F then
							-- 0xxxxxxx
						p.put_natural_8 (c.to_natural_8, m)
						m := m + 1
					else
							-- Make sure there is sufficient room for all the remaining characters and
							-- at least 5 bytes, i.e. 4 bytes for the maximum UTF-8 encoding,
							-- and one byte for the terminating null character. Note that we do not
							-- take into account `p_offset' because `m' already includes it.
							-- Note that `end_pos - i' represents the number of remaining characters
							-- to process in the current string.
						if not l_resized and then (m + 5 + (end_pos - i) > l_count) then
								-- Optimize resizing, once we have to resize, we actually perform the resizing
								-- only once.
							l_count := m + utf_8_bytes_count (s, i, end_pos) + 1
							p.resize (l_count)
							l_resized := True
						end
						if c <= 0x7FF then
								-- 110xxxxx 10xxxxxx
							p.put_natural_8 (((c |>> 6) | 0xC0).to_natural_8, m)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 1)
							m := m + 2
						elseif c <= 0xFFFF then
								-- 1110xxxx 10xxxxxx 10xxxxxx
							p.put_natural_8 (((c |>> 12) | 0xE0).to_natural_8, m)
							p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 2)
							m := m + 3
						else
								-- c <= 1FFFFF - there are no higher code points
								-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
							p.put_natural_8 (((c |>> 18) | 0xF0).to_natural_8, m)
							p.put_natural_8 ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
							p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
							p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 3)
							m := m + 4
						end
					end
				else
					l_decoded := False
						-- Simply put decoded value directly in stream.
					p.put_natural_8 (c.to_natural_8, m)
					m := m + 1
				end
			end
			if l_resized then
					-- `p' was resized so we adjust it to accommodate up to the terminating null character.
				p.resize (m + 1)
			end
			p.put_natural_8 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has ('%U')) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s.substring (start_pos, end_pos))
		end

	escaped_utf_32_string_to_utf_8_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-8 sequence corresponding to `s' interpreted as a UTF-32 sequence that could be escaped.
			-- If `s' contains the `escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		do
			create Result.make (s.count)
			escaped_utf_32_string_into_utf_8_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_escaped_string_32 (Result).same_string_general (s)
		end

	escaped_utf_32_string_into_utf_8_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy the UTF-8 sequence corresponding to `s' interpreted as a UTF-32 sequence that could
			-- be escaped appended into `a_result'.
			-- If `s' contains the `escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
			l_decoded: BOOLEAN
		do
			from
				n := s.count
					-- Reserve room for the one-byte-per-character case up front.
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c = escape_character.natural_32_code then
						-- We might be facing a character that was escaped.
						-- In the Unix case, we only accept the 1-byte encoded format.
					if i < n and then s.item (i + 1) = escape_character then
							-- The `escape_character' was escaped, it meant they really wanted an `escape_character'.
						i := i + 1
					elseif i + 1 < n then
							-- We have at least 2 characters to read, make sure they represent an hexadecimal
							-- value.
						l_encoded_value := s.substring (i + 1, i + 2)
						if is_hexa_decimal (l_encoded_value) then
							c := to_natural_32 (l_encoded_value)
							if c <= 0x7F then
									-- Value was encoded when it should not have been
									-- do nothing, we leave the original content as is.
								c := escape_character.natural_32_code
							else
									-- `c' now holds the raw byte value; skip the two hexadecimal digits.
								l_decoded := True
								i := i + 2
							end
						else
								-- Not an hexadecimal value, it was not escaped.
						end
					else
							-- Not enough to read to make it valid, it was not escaped.
					end
				end
				if not l_decoded then
						-- Regular code point: append its UTF-8 encoding.
					if c <= 0x7F then
							-- 0xxxxxxx
						a_result.extend (c.to_character_8)
					elseif c <= 0x7FF then
							-- 110xxxxx 10xxxxxx
						a_result.extend (((c |>> 6) | 0xC0).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					elseif c <= 0xFFFF then
							-- 1110xxxx 10xxxxxx 10xxxxxx
						a_result.extend (((c |>> 12) | 0xE0).to_character_8)
						a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					else
							-- c <= 1FFFFF - there are no higher code points
							-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
						a_result.extend (((c |>> 18) | 0xF0).to_character_8)
						a_result.extend ((((c |>> 12) & 0x3F) | 0x80).to_character_8)
						a_result.extend ((((c |>> 6) & 0x3F) | 0x80).to_character_8)
						a_result.extend (((c & 0x3F) | 0x80).to_character_8)
					end
				else
						-- Reset the flag for the next character.
					l_decoded := False
						-- Simply put decoded value directly in stream.
					a_result.extend (c.to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_8_string_8_to_escaped_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	string_32_into_utf_8_0_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-8 sequence corresponding to `s' with terminating zero
			-- to address `p + p_offset' and update the size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
			-- Delegates to `utf_32_string_into_utf_8_0_pointer'.
		require
			p_offset_non_negative: p_offset >= 0
		do
			utf_32_string_into_utf_8_0_pointer (s, p, p_offset, a_new_upper)
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s)
		end

	utf_32_string_into_utf_8_0_pointer (s: READABLE_STRING_GENERAL; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-8 sequence corresponding to `s', interpreted as a UTF-32 sequence,
			-- with terminating zero to address `p + p_offset' and update the size of `p' to the
			-- number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
		require
			p_offset_non_negative: p_offset >= 0
		local
			m: INTEGER
			i, n, l_count: INTEGER
			c: NATURAL_32
			l_resized: BOOLEAN
		do
				-- Basic assumptions that there will be only one-byte code units.
			n := s.count
			l_count := p.count
				-- Check that there is at least `n' bytes available plus the terminating null character.
			if l_count - p_offset < (n + 1) then
					-- Optimize resizing, once we have to resize, we actually perform the resizing
					-- only once.
				l_count := p_offset + utf_8_bytes_count (s, 1, n) + 1
				p.resize (l_count)
				l_resized := True
			end
				-- Fill `p' with the converted data.
				-- `i' is the index of the current character in `s', `m' the next write offset in `p'.
			from
				i := 0
				m := p_offset
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx.
					p.put_natural_8 (c.to_natural_8, m)
					m := m + 1
				else
						-- Make sure there is sufficient room for all the remaining characters and
						-- at least 5 bytes, i.e. 4 bytes for the maximum UTF-8 encoding,
						-- and one byte for the terminating null character. Note that we do not
						-- take into account `p_offset' because `m' already includes it.
						-- Note that `n - i' represents the number of remaining characters
						-- to process in the current string.
					if not l_resized and then (m + 5 + (n - i) > l_count) then
							-- Optimize resizing, once we have to resize, we actually perform the resizing
							-- only once.
						l_count := m + utf_8_bytes_count (s, i, n) + 1
						p.resize (l_count)
						l_resized := True
					end
					if c <= 0x7FF then
							-- 110xxxxx 10xxxxxx.
						p.put_natural_8 (((c |>> 6) | 0xC0).to_natural_8, m)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 1)
						m := m + 2
					elseif c <= 0xFFFF then
							-- 1110xxxx 10xxxxxx 10xxxxxx
						p.put_natural_8 (((c |>> 12) | 0xE0).to_natural_8, m)
						p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 2)
						m := m + 3
					else
							-- c <= 1FFFFF - there are no higher code points
							-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
						p.put_natural_8 (((c |>> 18) | 0xF0).to_natural_8, m)
						p.put_natural_8 ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
						p.put_natural_8 ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
						p.put_natural_8 (((c & 0x3F) | 0x80).to_natural_8, m + 3)
						m := m + 4
					end
				end
			end
			if l_resized then
					-- `p' was resized so we adjust it to accommodate up to the terminating null character.
				p.resize (m + 1)
			end
				-- `m' is the offset just after the last written byte: the terminator goes there.
			p.put_natural_8 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, a_new_upper.item - 1, False).same_string_general (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_8_0_subpointer_to_escaped_string_32 (p, p_offset, p.count, True).same_string_general (s)
		end

	utf_32_string_to_utf_8 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_8]
			-- UTF-8 sequence corresponding to `s', interpreted as a UTF-32 sequence.
			-- The sequence is not zero-terminated.
		do
				-- Build the zero-terminated form, then resize it by one item to drop the terminator.
			Result := utf_32_string_to_utf_8_0 (s)
			Result := Result.aliased_resized_area_with_default (0, Result.count - 1)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_8_string_8 (s) as l_ref and then ∀ n: Result ¦ n = l_ref.code (@ n.target_index + 1)
		end

	utf_32_string_to_utf_8_0 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_8]
			-- UTF-8 sequence corresponding to `s', interpreted as a UTF-32 sequence.
			-- The sequence is zero-terminated.
		local
			m: INTEGER
			i, n: like {STRING_32}.count
			c: NATURAL_32
		do
			n := s.count
				-- First compute how many bytes we need to convert `s' to UTF-8.
			m := utf_8_bytes_count (s, 1, n)
				-- Fill `Result' with the converted data.
				-- `m' is reused below as the next write index in `Result'.
			from
				create Result.make_filled (0, m + 1)
				i := 0
				m := 0
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0x7F then
						-- 0xxxxxxx.
					Result.put (c.to_natural_8, m)
					m := m + 1
				elseif c <= 0x7FF then
						-- 110xxxxx 10xxxxxx.
					Result.put (((c |>> 6) | 0xC0).to_natural_8, m)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 1)
					m := m + 2
				elseif c <= 0xFFFF then
						-- 1110xxxx 10xxxxxx 10xxxxxx
					Result.put (((c |>> 12) | 0xE0).to_natural_8, m)
					Result.put ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 1)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 2)
					m := m + 3
				else
						-- c <= 1FFFFF - there are no higher code points
						-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
					Result.put (((c |>> 18) | 0xF0).to_natural_8, m)
					Result.put ((((c |>> 12) & 0x3F) | 0x80).to_natural_8, m + 1)
					Result.put ((((c |>> 6) & 0x3F) | 0x80).to_natural_8, m + 2)
					Result.put (((c & 0x3F) | 0x80).to_natural_8, m + 3)
					m := m + 4
				end
			end
			Result.put (0, m)
		ensure
			instance_free: class
			attached_utf_8_string: attached utf_32_string_to_utf_8_string_8 (s) as l_ref
			count: Result.count = l_ref.count + 1
			roundtrip: ∀ x: l_ref ¦ x = Result [@ x.target_index - 1].to_character_8
			zero_terminated: Result [Result.upper] = 0
		end

feature -- UTF-8 to UTF-32

	utf_8_0_pointer_to_escaped_string_32 (p: MANAGED_POINTER): STRING_32
			-- {STRING_32} object corresponding to UTF-8 sequence `p' which is zero-terminated,
			-- where invalid UTF-8 sequences are escaped.
		do
				-- Allocate Result with the same number of bytes as `p'.
			create Result.make (p.count)
			utf_8_0_pointer_into_escaped_string_32 (p, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (Result) as s and then ∀ c: s ¦ c = p.read_natural_8 (@ c.target_index - 1).to_character_8
		end

	utf_8_0_pointer_into_escaped_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-8 sequence `p' which is zero-terminated,
			-- where invalid UTF-8 sequences are escaped, appended into `a_result'.
do
				-- Decode the whole of `p', stopping at the first null byte.
			utf_8_0_subpointer_into_escaped_string_32 (p, 0, p.count - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as s and then ∀ c: s ¦ c = p.read_natural_8 (@ c.target_index - 1).to_character_8
		end

	utf_8_0_subpointer_to_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN): STRING_32
			-- {STRING_32} object corresponding to UTF-8 sequence `p' between indexes `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-8 sequences are escaped.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count
		do
				-- Allocate Result with the same number of bytes as `p'.
			create Result.make (p.count)
			utf_8_0_subpointer_into_escaped_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (Result) as s and then ∀ c: s ¦ c = p.read_natural_8 (start_pos + @ c.target_index - 1).to_character_8
		end

	utf_8_0_subpointer_into_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-8 sequence `p' between indexes `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-8 sequences are escaped, appended into `a_result'.
		require
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count
		local
			i: like {STRING_8}.count
			c1, c2, c3, c4: NATURAL_8
			l_last_char: CHARACTER_32
		do
				-- `i' is the byte offset of the next lead byte to examine in `p'.
			from
				a_result.grow (a_result.count + end_pos - start_pos + 1)
				i := start_pos
			until
				i > end_pos
			loop
				c1 := p.read_natural_8 (i)
				if c1 = 0 and a_stop_at_null then
						-- We hit our null terminating character, we can stop
					i := end_pos + 1
				elseif c1 <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c1.to_character_32)
					i := i + 1
				elseif (c1 & 0xE0) = 0xC0 then
						-- Lead byte of a 2-byte form: one more byte must be available.
					if i < end_pos then
						c2 := p.read_natural_8 (i + 1)
						if (c2 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 110xxxxx 10xxxxxx
							a_result.extend ((
								((c1.as_natural_32 & 0x1F) |<< 6) |
								(c2.as_natural_32 & 0x3F)
							).to_character_32)
							i := i + 2
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
							i := i + 1
						end
					else
							-- Invalid UTF-8 sequence, we escape the first byte.
						escape_code_into (a_result, c1)
						i := i + 1
					end
				elseif (c1 & 0xF0) = 0xE0 then
						-- Lead byte of a 3-byte form: two more bytes must be available.
					if i + 1 < end_pos then
						c2 := p.read_natural_8 (i + 1)
						c3 := p.read_natural_8 (i + 2)
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 1110xxxx 10xxxxxx 10xxxxxx
							l_last_char := (((c1.as_natural_32 & 0xF) |<< 12) |
								((c2.as_natural_32 & 0x3F) |<< 6) |
								(c3.as_natural_32 & 0x3F)
							).to_character_32
							a_result.extend (l_last_char)
							i := i + 3
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
							i := i + 1
						end
					else
							-- Invalid UTF-8 sequence.
						escape_code_into (a_result, c1)
						i := i + 1
					end
				elseif (c1 & 0xF8) = 0xF0 then
						-- Lead byte of a 4-byte form: three more bytes must be available.
					if i + 2 < end_pos then
						c2 := p.read_natural_8 (i + 1)
						c3 := p.read_natural_8 (i + 2)
						c4 := p.read_natural_8 (i + 3)
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 and (c4 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
							a_result.extend ((
								((c1.as_natural_32 & 0x7) |<< 18) |
								((c2.as_natural_32 & 0x3F) |<< 12) |
								((c3.as_natural_32 & 0x3F) |<< 6) |
								(c4.as_natural_32 & 0x3F)
							).to_character_32)
							i := i + 4
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
							i := i + 1
						end
					else
							-- Invalid UTF-8 sequence.
						escape_code_into (a_result, c1)
						i := i + 1
					end
				else
						-- Clearly invalid UTF-8
					escape_code_into (a_result, c1)
					i := i + 1
				end
			end
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as s and then ∀ c: s ¦ c = p.read_natural_8 (start_pos + @ c.target_index - 1).to_character_8
		end

	utf_8_string_8_to_string_32 (s: READABLE_STRING_8): STRING_32
			-- STRING_32 corresponding to UTF-8 sequence `s'.
		do
				-- The decoded string never has more characters than `s' has bytes.
			create Result.make (s.count)
			utf_8_string_8_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_8_string_8 (s) implies utf_32_string_to_utf_8_string_8 (Result).same_string (s)
		end

	utf_8_string_8_into_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy STRING_32 corresponding to UTF-8 sequence `s' appended into `a_result'.
local i: like {STRING_8}.count n: like {STRING_8}.count c: NATURAL_32 do from n := s.count a_result.grow (a_result.count + n) until i >= n loop i := i + 1 c := s.code (i) if c <= 0x7F then -- 0xxxxxxx a_result.extend (c.to_character_32) elseif c <= 0xDF then -- 110xxxxx 10xxxxxx i := i + 1 if i <= n then a_result.extend (( ((c & 0x1F) |<< 6) | (s.code (i) & 0x3F) ).to_character_32) end elseif c <= 0xEF then -- 1110xxxx 10xxxxxx 10xxxxxx i := i + 2 if i <= n then a_result.extend (( ((c & 0xF) |<< 12) | ((s.code (i - 1) & 0x3F) |<< 6) | (s.code (i) & 0x3F) ).to_character_32) end elseif c <= 0xF7 then -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx i := i + 3 if i <= n then a_result.extend (( ((c & 0x7) |<< 18) | ((s.code (i - 2) & 0x3F) |<< 12) | ((s.code (i - 1) & 0x3F) |<< 6) | (s.code (i) & 0x3F) ).to_character_32) end end end ensure instance_free: class roundtrip: is_valid_utf_8_string_8 (s) implies utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s) end utf_8_string_8_to_escaped_string_32 (s: READABLE_STRING_8): STRING_32 -- STRING_32 corresponding to UTF-8 sequence `s', where invalid UTF-8 sequences are escaped. do create Result.make (s.count) utf_8_string_8_into_escaped_string_32 (s, Result) ensure instance_free: class roundtrip: escaped_utf_32_string_to_utf_8_string_8 (Result).same_string (s) end utf_8_string_8_into_escaped_string_32 (s: READABLE_STRING_8; a_result: STRING_32) -- Copy STRING_32 corresponding to UTF-8 sequence `s', where invalid UTF-8 sequences are escaped, -- appended into `a_result'. 
local
			i: like {STRING_8}.count
			n: like {STRING_8}.count
			c1, c2, c3, c4: NATURAL_8
			l_last_char: CHARACTER_32
		do
				-- `c1' is the lead byte at index `i'; `c2'..`c4' are the bytes that follow it.
				-- `i' is moved past the continuation bytes only when the whole sequence is
				-- accepted, so after an escape the scan resumes at the byte right after `c1'.
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				i := i + 1
				c1 := s.code (i).as_natural_8
				if c1 <= 0x7F then
						-- 0xxxxxxx
					a_result.extend (c1.to_character_32)
				elseif (c1 & 0xE0) = 0xC0 then
						-- Lead byte of a 2-byte sequence: one more byte must be available.
					if i < n then
						c2 := s.code (i + 1).as_natural_8
						if (c2 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 110xxxxx 10xxxxxx
							a_result.extend ((
								((c1.as_natural_32 & 0x1F) |<< 6) |
								(c2.as_natural_32 & 0x3F)
							).to_character_32)
							i := i + 1
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
						end
					else
							-- Invalid UTF-8 sequence, we escape the first byte.
						escape_code_into (a_result, c1)
					end
				elseif (c1 & 0xF0) = 0xE0 then
						-- Lead byte of a 3-byte sequence: two more bytes must be available.
					if i + 1 < n then
						c2 := s.code (i + 1).as_natural_8
						c3 := s.code (i + 2).as_natural_8
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 1110xxxx 10xxxxxx 10xxxxxx
							l_last_char := (((c1.as_natural_32 & 0xF) |<< 12) | ((c2.as_natural_32 & 0x3F) |<< 6) | (c3.as_natural_32 & 0x3F) ).to_character_32
							a_result.extend (l_last_char)
							i := i + 2
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
						end
					else
							-- Invalid UTF-8 sequence.
escape_code_into (a_result, c1)
					end
				elseif (c1 & 0xF8) = 0xF0 then
						-- Lead byte of a 4-byte sequence: three more bytes must be available.
					if i + 2 < n then
						c2 := s.code (i + 1).as_natural_8
						c3 := s.code (i + 2).as_natural_8
						c4 := s.code (i + 3).as_natural_8
						if (c2 & 0xC0) = 0x80 and (c3 & 0xC0) = 0x80 and (c4 & 0xC0) = 0x80 then
								-- Valid UTF-8 sequence:
								-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
							a_result.extend ((
								((c1.as_natural_32 & 0x7) |<< 18) |
								((c2.as_natural_32 & 0x3F) |<< 12) |
								((c3.as_natural_32 & 0x3F) |<< 6) |
								(c4.as_natural_32 & 0x3F)
							).to_character_32)
							i := i + 3
						else
								-- Invalid UTF-8 sequence, we escape the first byte
								-- and try with the next one to see if it is the starting
								-- byte of a valid UTF-8 sequence.
							escape_code_into (a_result, c1)
						end
					else
							-- Invalid UTF-8 sequence.
						escape_code_into (a_result, c1)
					end
				else
						-- Clearly invalid UTF-8
					escape_code_into (a_result, c1)
				end
			end
		ensure
			instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_8_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

feature -- UTF-32 to UTF-16

	string_32_to_utf_16 (s: READABLE_STRING_32): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s'.
			-- The sequence is not zero-terminated.
		do
				-- Same conversion as for any READABLE_STRING_GENERAL.
			Result := utf_32_string_to_utf_16 (s)
		ensure
			instance_free: class
				-- Each code unit of `Result' is rebuilt from its two bytes in the UTF-16LE
				-- string: low byte first, high byte shifted by 8 bits.
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then
				∀ n: Result ¦ n = (l_ref.code (@ n.target_index * 2 + 1) | (l_ref.code ((@ n.target_index + 1) * 2) |<< 8))
		end

	utf_32_string_to_utf_16 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s' interpreted as a UTF-32 sequence.
			-- The sequence is not zero-terminated.
do
				-- Build the zero-terminated sequence, then shrink it by one code unit
				-- so that the trailing zero is no longer part of the result.
			Result := utf_32_string_to_utf_16_0 (s)
			Result := Result.aliased_resized_area_with_default (0, Result.count - 1)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then
				∀ n: Result ¦ n = (l_ref.code (@ n.target_index * 2 + 1) | (l_ref.code ((@ n.target_index + 1) * 2) |<< 8))
		end

	string_32_to_utf_16_0 (s: READABLE_STRING_32): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s' with terminating zero.
		do
			Result := utf_32_string_to_utf_16_0 (s)
		ensure
			instance_free: class
				-- The comparison is made on `Result' without its last (terminating) item.
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then
				∀ n: Result.resized_area_with_default (0, Result.count - 1) ¦ n = (l_ref.code (@ n.target_index * 2 + 1) | ((l_ref.code ((@ n.target_index + 1) * 2)) |<< 8))
		end

	utf_32_string_to_utf_16_0 (s: READABLE_STRING_GENERAL): SPECIAL [NATURAL_16]
			-- UTF-16 sequence corresponding to `s', interpreted as a UTF-32 sequence,
			-- with terminating zero.
		local
				-- `i': index of the current character of `s'.
			i: like {STRING_32}.count
				-- `n': number of characters of `s'.
			n: like {STRING_32}.count
				-- `m': number of code units written so far (see loop invariant).
			m: like {STRING_32}.count
				-- `p': capacity of `Result' minus the slot kept for the terminating zero.
			p: like {STRING_32}.count
			c: NATURAL_32
		do
			from
				m := 0
				n := s.count
				p := n
				create Result.make_empty (p + 1)
			invariant
				m = Result.count
				p + 1 = Result.capacity
			until
				i >= n
			loop
				i := i + 1
					-- Make sure there is sufficient room for at least 2 code units.
				if p < m + 2 then
						-- Room for the `n - i' remaining characters plus 2 code units.
					p := m + (n - i) + 2
					Result := Result.aliased_resized_area (p + 1)
				end
				c := s.code (i)
				if c <= 0xFFFF then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
					Result.extend (c.to_natural_16)
					m := m + 1
				else
						-- Supplementary Planes: surrogate pair with lead and trail surrogates.
Result.extend ((0xD7C0 + (c |>> 10)).to_natural_16)
						-- 0xD7C0 is 0xD800 - (0x10000 |>> 10): the 0x10000 offset is folded
						-- into the lead surrogate base. The trail carries the low 10 bits.
					Result.extend ((0xDC00 + (c & 0x3FF)).to_natural_16)
					m := m + 2
				end
			end
				-- Terminating zero.
			Result.extend (0)
		ensure
			instance_free: class
			roundtrip: attached utf_32_string_to_utf_16le_string_8 (s) as l_ref and then
				∀ x: Result.resized_area_with_default (0, Result.count - 1) ¦ x = (l_ref.code (@ x.target_index * 2 + 1) | ((l_ref.code ((@ x.target_index + 1) * 2)) |<< 8))
		end

	string_32_into_utf_16_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-16 sequence corresponding to `s' to address `p + p_offset'
			-- and update the size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is not zero-terminated.
		require
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		do
				-- Convert the whole of `s', i.e. the substring from 1 to `s.count'.
			utf_32_substring_into_utf_16_pointer (s, 1, s.count, p, p_offset, a_new_upper)
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string (s)
		end

	string_32_into_utf_16_0_pointer (s: READABLE_STRING_32; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-16 sequence corresponding to `s' with terminating zero
			-- to address `p + p_offset' and update the size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
require
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		do
				-- Convert the whole of `s', i.e. the substring from 1 to `s.count'.
			utf_32_substring_into_utf_16_0_pointer (s, 1, s.count, p, p_offset, a_new_upper)
		ensure
			instance_free: class
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string (s)
		end

	utf_32_substring_into_utf_16_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: like {READABLE_STRING_32}.count; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-16 sequence corresponding to the substring of `s',
			-- interpreted as a UTF-32 sequence, starting at index `start_pos'
			-- and ending at index `end_pos' to address `p + p_offset' and update the
			-- size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is not zero-terminated.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
				-- Size of `p' in bytes before the conversion.
			m: INTEGER
		do
			m := p.count
				-- Write the zero-terminated form first; `p' is larger afterwards only
				-- if it had to be resized to hold the data.
			utf_32_substring_into_utf_16_0_pointer (s, start_pos, end_pos, p, p_offset, a_new_upper)
			if m < p.count then
					-- Remove the null terminating character.
p.resize (p.count - 2)
				if a_new_upper /= Void then
						-- `p' now ends right after the last written code unit, so its size is
						-- the byte offset callers use to locate the end of the data (they read
						-- code units up to `a_new_upper.item // 2 - 1', see postcondition).
					a_new_upper.put (p.count)
				end
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s)
		end

	utf_32_substring_into_utf_16_0_pointer (s: READABLE_STRING_GENERAL; start_pos, end_pos: like {READABLE_STRING_32}.count; p: MANAGED_POINTER; p_offset: INTEGER; a_new_upper: detachable CELL [INTEGER])
			-- Write UTF-16 sequence corresponding to the substring of `s',
			-- interpreted as a UTF-32 sequence, starting at index `start_pos'
			-- and ending at index `end_pos' to address `p + p_offset' and update the
			-- size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
			i: like {READABLE_STRING_GENERAL}.count
			c: NATURAL_32
				-- `m': byte offset in `p' of the next code unit to write.
				-- `l_count': known size of `p' in bytes.
			m, l_count: like {MANAGED_POINTER}.count
				-- Has `p' already been resized to its final capacity?
			l_resized: BOOLEAN
		do
				-- Write UTF-16 sequence.
			from
					-- `i' first holds the number of characters to convert.
				i := end_pos - start_pos + 1
				l_count := p.count
					-- Check that there is at least `i * 2' bytes available plus the terminating null character.
				if l_count - p_offset < (i + 1) * 2 then
						-- Optimize resizing, once we have to resize, we actually perform the resizing
						-- only once.
l_count := p_offset + utf_16_bytes_count (s, start_pos, end_pos) + 2
					p.resize (l_count)
					l_resized := True
				end
					-- From here on `i' is the index in `s' of the last processed character
					-- and `m' the byte offset in `p' of the next code unit to write.
				i := start_pos - 1
				m := p_offset
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0xFFFF then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
					p.put_natural_16 (c.to_natural_16, m)
					m := m + 2
				else
						-- Make sure there is sufficient room for all the remaining characters and
						-- at least 3 code units of 2 bytes each, i.e. 2 code unit for the surrogate
						-- pair, and one unit for the terminating null character. Note that we do not
						-- take into account `p_offset' because `m' already includes it.
						-- Note that `end_pos - i' represents the number of remaining characters
						-- to process in the current string.
					if not l_resized and then (m + 6 + (end_pos - i) * 2 > l_count) then
							-- Optimize resizing, once we have to resize, we actually perform the resizing
							-- only once.
						l_count := m + utf_16_bytes_count (s, i, end_pos) + 2
						p.resize (l_count)
						l_resized := True
					end
						-- Supplementary Planes: surrogate pair with lead and trail surrogates.
					p.put_natural_16 ((0xD7C0 + (c |>> 10)).to_natural_16, m)
					p.put_natural_16 ((0xDC00 + (c & 0x3FF)).to_natural_16, m + 2)
					m := m + 4
				end
			end
				-- Adjust number of written bytes and add terminating zero at the end.
			if l_resized then
					-- We had to add a code unit on 4 bytes. We adjust the size.
p.resize (m + 2)
			end
				-- `m' is the byte offset of the terminating zero; it is also the value
				-- reported through `a_new_upper'.
			p.put_natural_16 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s)
			roundtrip: (a_new_upper = Void and then not s.has ('%U')) implies utf_16_0_subpointer_to_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s)
		end

	utf_32_string_to_utf_16le_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence
		do
				-- We would need at least 2-bytes per characters in `s'.
			create Result.make (s.count * 2)
			utf_32_string_into_utf_16le_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_string_32 (Result).same_string_general (s)
		end

	utf_32_string_into_utf_16le_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence
			-- appended into `a_result'.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
				-- Surrogate code unit being written.
			l_nat16: NATURAL_16
		do
			from
				n := s.count
					-- We would need at least 2-bytes per characters in `s'.
				a_result.grow (a_result.count + n * 2)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c <= 0xFFFF then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
						-- Little endian: low byte first, then high byte.
					a_result.extend ((c & 0x00FF).to_character_8)
					a_result.extend (((c & 0xFF00) |>> 8).to_character_8)
				else
						-- Write the lead surrogate pair.
					l_nat16 := (0xD7C0 + (c |>> 10)).to_natural_16
					a_result.extend ((l_nat16 & 0x00FF).to_character_8)
					a_result.extend (((l_nat16 & 0xFF00) |>> 8).to_character_8)
						-- Write the trail surrogate pair.
l_nat16 := (0xDC00 + (c & 0x3FF)).to_natural_16
					a_result.extend ((l_nat16 & 0x00FF).to_character_8)
					a_result.extend (((l_nat16 & 0xFF00) |>> 8).to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

	escaped_utf_32_substring_into_utf_16_0_pointer (
			s: READABLE_STRING_GENERAL;
			start_pos, end_pos: like {READABLE_STRING_32}.count;
			p: MANAGED_POINTER;
			p_offset: INTEGER;
			a_new_upper: detachable CELL [INTEGER]
		)
			-- Write UTF-16 sequence corresponding to the substring of `s',
			-- interpreted as a UTF-32 sequence that could be escaped, starting at index `start_pos'
			-- and ending at index `end_pos' to address `p + p_offset' and update the
			-- size of `p' to the number of written bytes.
			-- If `a_new_upper' is provided, the upper index of `p' containing the zero-termination
			-- is written to `a_new_upper'.
			-- The sequence is zero-terminated.
		require
			start_position_big_enough: start_pos >= 1
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos <= s.count
			even_p_offset: (p_offset \\ 2) = 0
			p_offset_non_negative: p_offset >= 0
		local
				-- `i': index in `s' of the current character.
				-- `n': number of characters to convert.
				-- `m': byte offset in `p' of the next code unit to write.
				-- `l_count': known size of `p' in bytes.
			i, n, m, l_count: INTEGER
			c: NATURAL_32
			l_encoded_value: READABLE_STRING_GENERAL
				-- Does `c' hold a value decoded from an escape sequence?
			l_decoded: BOOLEAN
				-- Has `p' already been resized to its final capacity?
			l_resized: BOOLEAN
		do
			from
				n := end_pos - start_pos + 1
				l_count := p.count
					-- Check that there is at least `i * 2' bytes available plus the terminating null character.
				if l_count - p_offset < (n + 1) * 2 then
						-- Optimize resizing, once we have to resize, we actually perform the resizing
						-- only once.
					l_count := p_offset + utf_16_bytes_count (s, start_pos, end_pos) + 2
					p.resize (l_count)
					l_resized := True
				end
				i := start_pos - 1
				m := p_offset
			until
				i >= end_pos
			loop
				i := i + 1
				c := s.code (i)
				if c = escape_character.natural_32_code then
						-- We might be facing a character that was escaped.
if i < end_pos then
							-- `i' is an absolute index in `s', so the look-ahead is bounded by
							-- `end_pos', the last index of the substring being converted.
						if s.item (i + 1) = escape_character then
								-- The `escape_character' was escaped, it meant they really wanted an `escape_character'.
							i := i + 1
						elseif s.item (i + 1) = 'u' then
								-- "uHHHH" needs 5 characters after the escape character: `i + 5 <= end_pos'.
							if i + 4 < end_pos then
								l_encoded_value := s.substring (i + 2, i + 5)
								if is_hexa_decimal (l_encoded_value) then
									c := to_natural_32 (l_encoded_value)
									if c < 0xD800 or c > 0xDFFF then
											-- Value was encoded when it should not have been
											-- do nothing, we leave the original content as is.
										c := escape_character.natural_32_code
									else
											-- An escaped surrogate: skip the 5 characters "uHHHH".
										l_decoded := True
										i := i + 5
									end
								else
										-- Not an hexadecimal value, it was not escaped.
								end
							else
									-- Not enough characters to make a 2-byte value, it was not escaped.
							end
						else
								-- Value was most likely not encoded, because if it did, it would be the
								-- hexadecimal representation of a byte which clearly did not need to
								-- be escaped
						end
					else
							-- Nothing more to read, clearly it was not encoded.
					end
				end
				if not l_decoded then
					if c <= 0xFFFF then
							-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
						p.put_natural_16 (c.to_natural_16, m)
						m := m + 2
					else
							-- Make sure there is sufficient room for all the remaining characters and
							-- at least 3 code units of 2 bytes each, i.e. 2 code unit for the surrogate
							-- pair, and one unit for the terminating null character. Note that we do not
							-- take into account `p_offset' because `m' already includes it.
							-- Note that `end_pos - i' represents the number of remaining characters
							-- to process in the current string.
						if not l_resized and then (m + 6 + (end_pos - i) * 2 > l_count) then
								-- Optimize resizing, once we have to resize, we actually perform the resizing
								-- only once.
							l_count := m + utf_16_bytes_count (s, i, end_pos) + 2
							p.resize (l_count)
							l_resized := True
						end
							-- Write the lead surrogate pair.
						p.put_natural_16 ((0xD7C0 + (c |>> 10)).to_natural_16, m)
							-- Write the trail surrogate pair.
p.put_natural_16 ((0xDC00 + (c & 0x3FF)).to_natural_16, m + 2)
						m := m + 4
					end
				else
						-- Reset the flag for the next iteration.
					l_decoded := False
						-- Simply put decoded value directly in stream.
					p.put_natural_16 (c.to_natural_16, m)
					m := m + 2
				end
			end
			if l_resized then
					-- We had to add a code unit on 4 bytes. We adjust the size.
				p.resize (m + 2)
			end
				-- `m' is the byte offset of the terminating zero; it is also the value
				-- reported through `a_new_upper'.
			p.put_natural_16 (0, m)
			if a_new_upper /= Void then
				a_new_upper.put (m)
			end
		ensure
			instance_free: class
			p_count_may_increase: p.count >= old p.count
			roundtrip: a_new_upper /= Void implies utf_16_0_subpointer_to_escaped_string_32 (p, p_offset // 2, (a_new_upper.item // 2) - 1, False).same_string_general (s.substring (start_pos, end_pos))
			roundtrip: (a_new_upper = Void and then not s.substring (start_pos, end_pos).has ('%U')) implies utf_16_0_subpointer_to_escaped_string_32 (p, p_offset // 2, (p.count // 2) - 1, True).same_string_general (s.substring (start_pos, end_pos))
		end

	escaped_utf_32_string_to_utf_16le_string_8 (s: READABLE_STRING_GENERAL): STRING_8
			-- UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence that could be escaped.
			-- If `s' contains the `escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		do
				-- We would need at least 2-bytes per characters in `s'.
			create Result.make (s.count * 2)
			escaped_utf_32_string_into_utf_16le_string_8 (s, Result)
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_escaped_string_32 (Result).same_string_general (s)
		end

	escaped_utf_32_string_into_utf_16le_string_8 (s: READABLE_STRING_GENERAL; a_result: STRING_8)
			-- Copy UTF-16LE sequence corresponding to `s' interpreted as a UTF-32 sequence that could be
			-- escaped appended into `a_result'.
-- If `s' contains the `escape_character' followed by either "HH" or "uHHHH" where H stands
			-- for an hexadecimal digit, then `s' has been escaped and will be converted to what is
			-- expected by the current platform.
			-- Otherwise it will be ignored and it will be left as is.
			-- See the note clause for the class for more details on the encoding.
		local
			i: like {STRING_32}.count
			n: like {STRING_32}.count
			c: NATURAL_32
				-- Surrogate code unit being written.
			l_nat16: NATURAL_16
			l_encoded_value: READABLE_STRING_GENERAL
				-- Does `c' hold a value decoded from an escape sequence?
			l_decoded: BOOLEAN
		do
			from
				n := s.count
					-- We would need at least 2-bytes per characters in `s'.
				a_result.grow (a_result.count + n * 2)
			until
				i >= n
			loop
				i := i + 1
				c := s.code (i)
				if c = escape_character.natural_32_code then
						-- We might be facing a character that was escaped.
					if i < n then
						if s.item (i + 1) = escape_character then
								-- The `escape_character' was escaped, it meant they really wanted an `escape_character'.
							i := i + 1
						elseif s.item (i + 1) = 'u' then
								-- "uHHHH" needs 5 characters after the escape character: `i + 5 <= n'.
							if i + 4 < n then
								l_encoded_value := s.substring (i + 2, i + 5)
								if is_hexa_decimal (l_encoded_value) then
									c := to_natural_32 (l_encoded_value)
									if c < 0xD800 or c > 0xDFFF then
											-- Value was encoded when it should not have been
											-- do nothing, we leave the original content as is.
										c := escape_character.natural_32_code
									else
											-- An escaped surrogate: skip the 5 characters "uHHHH".
										l_decoded := True
										i := i + 5
									end
								else
										-- Not an hexadecimal value, it was not escaped.
								end
							else
									-- Not enough characters to make a 2-byte value, it was not escaped.
							end
						else
								-- Value was most likely not encoded, because if it did, it would be the
								-- hexadecimal representation of a byte which clearly did not need to
								-- be escaped
						end
					else
							-- Nothing more to read, clearly it was not encoded.
					end
				end
				if not l_decoded then
					if c <= 0xFFFF then
							-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
						a_result.extend ((c & 0x00FF).to_character_8)
						a_result.extend (((c & 0xFF00) |>> 8).to_character_8)
					else
							-- Write the lead surrogate pair.
l_nat16 := (0xD7C0 + (c |>> 10)).to_natural_16
						a_result.extend ((l_nat16 & 0x00FF).to_character_8)
						a_result.extend (((l_nat16 & 0xFF00) |>> 8).to_character_8)
							-- Write the trail surrogate pair.
						l_nat16 := (0xDC00 + (c & 0x3FF)).to_natural_16
						a_result.extend ((l_nat16 & 0x00FF).to_character_8)
						a_result.extend (((l_nat16 & 0xFF00) |>> 8).to_character_8)
					end
				else
						-- Reset the flag for the next iteration.
					l_decoded := False
						-- Simply put decoded value directly in stream.
					a_result.extend ((c & 0x00FF).to_character_8)
					a_result.extend (((c & 0xFF00) |>> 8).to_character_8)
				end
			end
		ensure
			instance_free: class
			roundtrip: utf_16le_string_8_to_escaped_string_32 (a_result.substring (old a_result.count + 1, a_result.count)).same_string_general (s)
		end

feature -- UTF-16 to UTF-32

	utf_16_0_pointer_to_string_32 (p: MANAGED_POINTER): STRING_32
			-- {STRING_32} object corresponding to UTF-16 sequence `p' which is zero-terminated.
		require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Allocate Result with the same number of bytes as `p'.
			create Result.make (p.count)
			utf_16_0_pointer_into_string_32 (p, Result)
		ensure
			instance_free: class
				-- The code unit at 0-based position `k' of the re-encoded result is
				-- compared with the one stored at byte offset `k * 2' in `p'.
			roundtrip: is_valid_utf_16_subpointer (p, 0, p.count // 2, True) implies
				∀ n: string_32_to_utf_16 (Result) ¦ n = p.read_natural_16 (@ n.target_index * 2)
		end

	utf_16_0_pointer_into_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16 sequence `p' which is zero-terminated
			-- appended into `a_result'.
require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Decode all the code units of `p' (indexes 0 to `p.count // 2 - 1'),
				-- stopping at the first null code unit.
			utf_16_0_subpointer_into_string_32 (p, 0, p.count // 2 - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16_subpointer (p, 0, p.count // 2, True) implies
				∀ n: string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)) ¦ n = p.read_natural_16 (@ n.target_index * 2)
		end

	utf_16_0_subpointer_to_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN): STRING_32
			-- {STRING_32} object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null'.
		require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		do
				-- Preallocate as many characters as there are bytes in `p'.
			create Result.make (p.count)
			utf_16_0_subpointer_into_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16_subpointer (p, start_pos, end_pos, a_stop_at_null) implies
				∀ n: string_32_to_utf_16 (Result) ¦ n = p.read_natural_16 (@ n.target_index * 2)
		end

	utf_16_0_subpointer_into_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null' appended into `a_result'.
		require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		local
				-- `i': byte offset in `p' of the next code unit to read.
				-- `n': byte offset in `p' of the last code unit to read.
			i, n: INTEGER
			c: NATURAL_32
		do
			from
					-- Allocate Result with the same number of bytes as copied from `p'.
a_result.grow (a_result.count + end_pos - start_pos + 1)
					-- Code unit positions are converted to byte offsets.
				i := start_pos * 2
				n := end_pos * 2
			until
				i > n
			loop
				c := p.read_natural_16 (i)
				if c = 0 and a_stop_at_null then
						-- We hit our null terminating character, we can stop
					i := n + 1
				else
					i := i + 2
					if c < 0xD800 or c >= 0xE000 then
							-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
						a_result.extend (c.to_character_32)
					else
							-- Supplementary Planes: surrogate pair with lead and trail surrogates.
							-- The next code unit is used as trail without being checked; nothing is
							-- appended when the surrogate is the last code unit of the range.
						if i <= n then
								-- 0x35FDC00 is (0xD800 |<< 10) + 0xDC00 - 0x10000.
							a_result.extend (((c.as_natural_32 |<< 10) + p.read_natural_16 (i) - 0x35FDC00).to_character_32)
							i := i + 2
						end
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16_subpointer (p, start_pos, end_pos, a_stop_at_null) implies
				∀ x: string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)) ¦ x = p.read_natural_16 (@ x.target_index * 2)
		end

	utf_16_0_pointer_to_escaped_string_32 (p: MANAGED_POINTER): STRING_32
			-- {STRING_32} object corresponding to UTF-16 sequence `p' which is zero-terminated,
			-- where invalid UTF-16LE sequences are escaped.
		require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Allocate Result with the same number of bytes as `p'.
			create Result.make (p.count)
			utf_16_0_pointer_into_escaped_string_32 (p, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (Result) as l_utf and then
				∀ c: l_utf.new_cursor.incremented (1) ¦ (c.natural_32_code | (l_utf.code (@ c.target_index + 1) |<< 8)) = p.read_natural_16 (@ c.target_index - 1)
		end

	utf_16_0_pointer_into_escaped_string_32 (p: MANAGED_POINTER; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16 sequence `p' which is zero-terminated,
			-- where invalid UTF-16LE sequences are escaped, appended into `a_result'.
require
			minimum_size: p.count >= 2
			valid_count: p.count \\ 2 = 0
		do
				-- Decode all the code units of `p' (indexes 0 to `p.count // 2 - 1'),
				-- stopping at the first null code unit.
			utf_16_0_subpointer_into_escaped_string_32 (p, 0, p.count // 2 - 1, True, a_result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_utf and then
				∀ c: l_utf.new_cursor.incremented (1) ¦ (c.natural_32_code | (l_utf.code (@ c.target_index + 1) |<< 8)) = p.read_natural_16 (@ c.target_index - 1)
		end

	utf_16_0_subpointer_to_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN): STRING_32
			-- {STRING_32} object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-16LE sequences are escaped.
		require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		do
				-- Preallocate one character per code unit of the requested range.
			create Result.make (end_pos - start_pos + 1)
			utf_16_0_subpointer_into_escaped_string_32 (p, start_pos, end_pos, a_stop_at_null, Result)
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (Result) as l_utf and then
				∀ c: l_utf.new_cursor.incremented (1) ¦ (c.natural_32_code | (l_utf.code (@ c.target_index + 1) |<< 8)) = p.read_natural_16 (start_pos * 2 + @ c.target_index - 1)
		end

	utf_16_0_subpointer_into_escaped_string_32 (p: MANAGED_POINTER; start_pos, end_pos: INTEGER; a_stop_at_null: BOOLEAN; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16 sequence `p' between code units `start_pos' and
			-- `end_pos' or the first null character encountered if `a_stop_at_null', where invalid
			-- UTF-16LE sequences are escaped, appended into `a_result'.
require
			minimum_size: p.count >= 2
			start_position_big_enough: start_pos >= 0
			end_position_big_enough: start_pos <= end_pos + 1
			end_pos_small_enough: end_pos < p.count // 2
		local
				-- `i': byte offset in `p' of the next code unit to read.
				-- `n': byte offset in `p' of the last code unit to read.
			i, n: INTEGER
				-- `c1': current code unit; `c2': the code unit following a lead surrogate.
			c1, c2: NATURAL_32
		do
			from
					-- Allocate Result with the same number of bytes as copied from `p'.
				a_result.grow (a_result.count + end_pos - start_pos + 1)
					-- Code unit positions are converted to byte offsets.
				i := start_pos * 2
				n := end_pos * 2
			until
				i > n
			loop
				c1 := p.read_natural_16 (i)
				if c1 = 0 and a_stop_at_null then
						-- We hit our null terminating character, we can stop
					i := n + 1
				else
					i := i + 2
					if c1 < 0xD800 or c1 >= 0xE000 then
							-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
						a_result.extend (c1.to_character_32)
					elseif c1 <= 0xDBFF and then i <= n then
							-- Check if a lead surrogate (value between 0xD800 and 0xDBFF) is followed by a trail surrogate.
						c2 := p.read_natural_16 (i)
						if c2 >= 0xDC00 and c2 <= 0xDFFF then
								-- Supplementary Planes: surrogate pair with lead and trail surrogates.
								-- 0x35FDC00 is (0xD800 |<< 10) + 0xDC00 - 0x10000.
							a_result.extend (((c1 |<< 10) + c2 - 0x35FDC00).to_character_32)
							i := i + 2
						else
								-- Escape a lead surrogate not followed by a trail one.
								-- `i' is not advanced: the following code unit is examined again.
							escape_code_into (a_result, c1.as_natural_16)
						end
					else
							-- Escape a trail surrogate not following a lead one or
							-- a lead surrogate not followed by a trail one.
						escape_code_into (a_result, c1.as_natural_16)
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: attached escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)) as l_utf and then
				∀ c: l_utf.new_cursor.incremented (1) ¦ (c.natural_32_code | (l_utf.code (@ c.target_index + 1) |<< 8)) = p.read_natural_16 (start_pos * 2 + @ c.target_index - 1)
		end

	utf_16_to_string_32 (s: SPECIAL [NATURAL_16]): STRING_32
			-- {STRING_32} object corresponding to UTF-16 sequence `s'.
do
				-- Preallocate one character per code unit of `s'.
			create Result.make (s.count)
			utf_16_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (Result).is_equal (s)
		end

	utf_16_into_string_32 (s: SPECIAL [NATURAL_16]; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16 sequence `s'
			-- appended into `a_result'.
		local
				-- `i': 0-based index in `s' of the next code unit to read.
			i: like {SPECIAL [NATURAL_16]}.count
			n: like {SPECIAL [NATURAL_16]}.count
			c: NATURAL_32
		do
			from
				n := s.count
				a_result.grow (a_result.count + n)
			until
				i >= n
			loop
				c := s [i]
				i := i + 1
				if c < 0xD800 or c >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
					a_result.extend (c.to_character_32)
				else
						-- Supplementary Planes: surrogate pair with lead and trail surrogates.
						-- The next code unit is used as trail without being checked; nothing is
						-- appended when the surrogate is the last code unit of `s'.
					if i < n then
						a_result.extend (((c |<< 10) + s [i] - 0x35FDC00).to_character_32)
						i := i + 1
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (a_result.substring (old a_result.count + 1, a_result.count)).is_equal (s)
		end

	utf_16le_string_8_to_string_32 (s: READABLE_STRING_8): STRING_32
			-- {STRING_32} object corresponding to UTF-16LE sequence `s'.
		do
				-- There is at least half the characters of `s'.
			create Result.make (s.count |>> 1)
			utf_16le_string_8_into_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies escaped_utf_32_string_to_utf_16le_string_8 (Result).same_string (s)
		end

	utf_16le_string_8_into_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16LE sequence `s' appended into `a_result'.
		local
				-- `i': index in `s' of the high byte of the current code unit.
			i, nb: INTEGER
			c1, c2: NATURAL_32
		do
			from
				nb := s.count
					-- There is at least half the characters of `s'.
				a_result.grow (a_result.count + (nb |>> 1))
			until
					-- Stop when fewer than 2 bytes remain: a trailing odd byte is ignored.
				i + 1 >= nb
			loop
				i := i + 2
					-- Extract the first 2-bytes
				c1 := s.code (i - 1) | (s.code (i) |<< 8)
				if c1 < 0xD800 or c1 >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit, this is valid Unicode.
a_result.extend (c1.to_character_32)
				else
						-- Surrogate: the next 2 bytes are used as trail without being checked;
						-- nothing is appended when they are missing.
					i := i + 2
					if i <= nb then
						c2 := s.code (i - 1) | (s.code (i) |<< 8)
							-- 0x35FDC00 is (0xD800 |<< 10) + 0xDC00 - 0x10000.
						a_result.extend (((c1 |<< 10) + c2 - 0x35FDC00).to_character_32)
					end
				end
			end
		ensure
			instance_free: class
			roundtrip: is_valid_utf_16le_string_8 (s) implies escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s)
		end

	utf_16le_string_8_to_escaped_string_32 (s: READABLE_STRING_8): STRING_32
			-- {STRING_32} object corresponding to UTF-16LE sequence `s', where invalid UTF-16LE
			-- sequences are escaped.
		do
				-- There is at least half the characters of `s'.
			create Result.make (s.count |>> 1)
			utf_16le_string_8_into_escaped_string_32 (s, Result)
		ensure
			instance_free: class
			roundtrip: escaped_utf_32_string_to_utf_16le_string_8 (Result).same_string (s)
		end

	utf_16le_string_8_into_escaped_string_32 (s: READABLE_STRING_8; a_result: STRING_32)
			-- Copy {STRING_32} object corresponding to UTF-16LE sequence `s', where invalid UTF-16LE
			-- sequences are escaped, appended into `a_result'.
		local
				-- `i': index in `s' of the high byte of the current code unit.
			i, nb: INTEGER
				-- `c1': current code unit; `c2': the code unit following a lead surrogate.
			c1, c2: NATURAL_32
		do
			from
				nb := s.count
					-- There is at least half the characters of `s'.
				a_result.grow (a_result.count + (nb |>> 1))
			until
					-- Stop when fewer than 2 bytes remain: a trailing odd byte is ignored.
				i + 1 >= nb
			loop
				i := i + 2
					-- Extract the first 2-bytes
				c1 := s.code (i - 1) | (s.code (i) |<< 8)
				if c1 < 0xD800 or c1 >= 0xE000 then
						-- Codepoint from Basic Multilingual Plane: one 16-bit code unit.
					a_result.extend (c1.to_character_32)
				elseif c1 <= 0xDBFF and i + 2 <= nb then
						-- Check if a lead surrogate is followed by a trail surrogate.
					c2 := s.code (i + 1) | (s.code (i + 2) |<< 8)
					if c2 >= 0xDC00 and c2 <= 0xDFFF then
							-- Supplementary Planes: surrogate pair with lead and trail surrogates.
						a_result.extend (((c1 |<< 10) + c2 - 0x35FDC00).to_character_32)
						i := i + 2
					else
							-- Escape a lead surrogate not followed by a trail one.
escape_code_into (a_result, c1.as_natural_16) end else -- Escape a trail surrogate not following a lead one or -- a lead surrogate not followed by a trail one. escape_code_into (a_result, c1.as_natural_16) end end ensure instance_free: class roundtrip: escaped_utf_32_string_to_utf_16le_string_8 (a_result.substring (old a_result.count + 1, a_result.count)).same_string (s) end feature -- UTF-16 to UTF-8 utf_16_to_utf_8_string_8 (s: SPECIAL [NATURAL_16]): STRING_8 -- UTF-8 sequence corresponding to UTF-16 sequence `s'. do debug ("to_implement") (create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-16 to UTF-8.") end Result := string_32_to_utf_8_string_8 (utf_16_to_string_32 (s)) ensure instance_free: class roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (utf_8_string_8_to_string_32 (Result)).is_equal (s) end utf_16_into_utf_8_string_8 (s: SPECIAL [NATURAL_16]; a_result: STRING_8) -- Copy UTF-8 sequence corresponding to UTF-16 sequence `s' appended into `a_result'. do debug ("to_implement") (create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-16 to UTF-8.") end string_32_into_utf_8_string_8 (utf_16_to_string_32 (s), a_result) ensure instance_free: class roundtrip: is_valid_utf_16 (s) implies string_32_to_utf_16 (utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count))).is_equal (s) end utf_16le_string_8_to_utf_8_string_8 (s: READABLE_STRING_8): STRING_8 -- UTF-8 sequence corresponding to UTF-16LE sequence `s'. do create Result.make (s.count) utf_16le_string_8_into_utf_8_string_8 (s, Result) ensure instance_free: class roundtrip: is_valid_utf_16le_string_8 (s) implies utf_32_string_to_utf_16le_string_8 (utf_8_string_8_to_string_32 (Result)).same_string (s) end utf_16le_string_8_into_utf_8_string_8 (s: READABLE_STRING_8; a_result: STRING_8) -- Copy UTF-8 sequence corresponding to UTF-16LE sequence `s' appended into `a_result'. 
require even_count: (s.count & 1) = 0 local v: SPECIAL [NATURAL_16] i: like {STRING_8}.count n: like {STRING_8}.count do from n := s.count create v.make_empty (n |>> 1) until i >= n loop i := i + 2 check valid_index: 1 <= i - 1 and i <= s.count end v.extend (s [i - 1].code.as_natural_16 | (s [i].code.as_natural_16 |<< 8)) end utf_16_into_utf_8_string_8 (v, a_result) ensure instance_free: class roundtrip: is_valid_utf_16le_string_8 (s) implies utf_32_string_to_utf_16le_string_8 (utf_8_string_8_to_string_32 (a_result.substring (old a_result.count + 1, a_result.count))).same_string (s) end feature -- UTF-8 to UTF-16 utf_8_string_8_to_utf_16 (s: READABLE_STRING_8): SPECIAL [NATURAL_16] -- UTF-16 sequence corresponding to UTF-8 sequence `s'. do debug ("to_implement") (create {REFACTORING_HELPER}).to_implement ("Convert directly from UTF-8 to UTF-16.") end Result := string_32_to_utf_16 (utf_8_string_8_to_string_32 (s)) ensure instance_free: class roundtrip: is_valid_utf_8_string_8 (s) implies utf_16_to_utf_8_string_8 (Result).same_string (s) end utf_8_string_8_to_utf_16_0 (s: READABLE_STRING_8): SPECIAL [NATURAL_16] -- UTF-16 sequence corresponding to UTF-8 sequence `s' with terminating zero. do Result := utf_8_string_8_to_utf_16 (s) Result := Result.aliased_resized_area_with_default (0, Result.count + 1) ensure instance_free: class roundtrip: is_valid_utf_8_string_8 (s) implies utf_16_to_utf_8_string_8 (Result).same_string (s) end feature -- Byte Order Mark (BOM) utf_8_bom_to_string_8: STRING_8 = "%/239/%/187/%/191/" -- UTF-8 BOM sequence. utf_16be_bom_to_string_8: STRING_8 = "%/254/%/255/" -- UTF-16BE BOM sequence. utf_16le_bom_to_string_8: STRING_8 = "%/255/%/254/" -- UTF-16LE BOM sequence. utf_32be_bom_to_string_8: STRING_8 = "%U%U%/254/%/255/" -- UTF-32BE BOM sequence. utf_32le_bom_to_string_8: STRING_8 = "%/255/%/254/%U%U" -- UTF-32LE BOM sequence. 
feature {NONE} -- Implementation

	escape_code_into (a_string: STRING_32; a_code: NATURAL_16)
			-- Append to `a_string' the escaped form of `a_code': `escape_character' followed
			-- either by the hexadecimal text of `a_code' narrowed to a NATURAL_8 when it does
			-- not exceed {NATURAL_8}.max_value, or by the letter `u' and the hexadecimal text
			-- of the full NATURAL_16 value otherwise.
		do
			a_string.append_character (escape_character)
			if a_code > {NATURAL_8}.max_value then
					-- Does not fit into one byte: `u' marker plus the 16-bit value.
				a_string.append_character ('u')
				a_string.append_string_general (a_code.to_hex_string)
			else
					-- Fits into one byte: 8-bit value only.
				a_string.append_string_general (a_code.as_natural_8.to_hex_string)
			end
		ensure
			instance_free: class
		end

	is_hexa_decimal (a_string: READABLE_STRING_GENERAL): BOOLEAN
			-- Does the shared hexadecimal converter accept `a_string' as a NATURAL_32 value?
		local
			l_parser: like ctoi_convertor
		do
			l_parser := ctoi_convertor
				-- The converter is shared, hence cleared before parsing.
			l_parser.reset ({NUMERIC_INFORMATION}.type_natural_32)
			l_parser.parse_string_with_type (a_string, {NUMERIC_INFORMATION}.type_natural_32)
			Result := l_parser.is_integral_integer
		ensure
			instance_free: class
		end

	to_natural_32 (a_hex_string: READABLE_STRING_GENERAL): NATURAL_32
			-- NATURAL_32 value denoted by the hexadecimal text `a_hex_string'.
		require
			is_hexa: is_hexa_decimal (a_hex_string)
		local
			l_parser: like ctoi_convertor
		do
			l_parser := ctoi_convertor
			l_parser.parse_string_with_type (a_hex_string, {NUMERIC_INFORMATION}.type_no_limitation)
			Result := l_parser.parsed_natural_32
		ensure
			instance_free: class
		end

	ctoi_convertor: HEXADECIMAL_STRING_TO_INTEGER_CONVERTER
			-- Shared converter from hexadecimal text to integer or natural values,
			-- configured to reject leading and trailing separators.
		once
			create Result.make
			Result.set_leading_separators_acceptable (False)
			Result.set_trailing_separators_acceptable (False)
		ensure
			instance_free: class
			ctoi_convertor_not_void: Result /= Void
		end

note
	ca_ignore: "CA011", "CA011: too many arguments"
	copyright: "Copyright (c) 1984-2021, Eiffel Software and others"
	license: "Eiffel Forum License v2 (see"
	source: "[
			Eiffel Software
			5949 Hollister Ave., Goleta, CA 93117 USA
			Telephone 805-685-1006, Fax 805-685-6869
			Website
			Customer support
		]"

end