indexing
	description: "UTF16LE implementation of UNICODE_CONVERTER"
	author: "Unicode Team"
	date: "$Date:$"
	revision: "$Revision:$"

class
	UTF16LE_CONVERTER
		-- UTF-16 converter, little endian byte order, without BOM.
		-- Every 16-bit code unit is stored low byte first.

inherit
	UNICODE_CONVERTER

feature -- character converter

	encoded_to_wide_character (b: ARRAY [NATURAL_8]): WIDE_CHARACTER is
			-- Character encoded by the UTF-16LE byte array `b'.
			-- `b' holds either one code unit [b1|b2] or a surrogate
			-- pair [b1|b2|b3|b4]; any count other than 2 is decoded
			-- as a surrogate pair.
		local
			new_code, high_bytes, low_bytes: NATURAL_32
		do
			if b.count = 2 then
					-- Single code unit: second byte is the high-order byte.
				new_code := b.item (b.lower + 1).to_natural_32.bit_shift_left (8) + b.item (b.lower).to_natural_32
			else
					-- 0..0 | 1101 10ZZ ZZxx xxxx
				high_bytes := b.item (b.lower + 1).to_natural_32.bit_shift_left (8) + b.item (b.lower).to_natural_32
					-- 0..0 | 1101 11yy yyyy yyyy
				low_bytes := b.item (b.lower + 3).to_natural_32.bit_shift_left (8) + b.item (b.lower + 2).to_natural_32
					-- Strip the surrogate markers, keeping 10 payload bits each.
				high_bytes := high_bytes.bit_and (0x3FF)
				low_bytes := low_bytes.bit_and (0x3FF)
					-- 0..0 0000 ZZZZ | xxxx xxyy yyyy yyyy
				new_code := high_bytes.bit_shift_left (10) + low_bytes
					-- 0..0 000z zzzz | xxxx xxyy yyyy yyyy, with zzzzz = ZZZZ + 1
				new_code := new_code + 0x10000
			end
			Result := new_code.to_character_32
		ensure then
			is_valid_code: Result.is_valid_code
		end

	wide_character_to_encoded (c: WIDE_CHARACTER): ARRAY [NATURAL_8] is
			-- UTF-16LE bytes of `c': 2 bytes for codes up to 0xFFFF,
			-- otherwise 4 bytes (high surrogate, then low surrogate).
			--
			-- The 4 byte scheme for U+010000 .. U+10FFFF produces two
			-- 16 bit words in the range 0xD800 .. 0xDFFF. Unicode does
			-- not assign characters to U+D800 .. U+DFFF; the range is
			-- reserved precisely to allow UTF-16 to work.
			-- http://www.tenminutetutor.com/index.php?article=utf16
		local
			code, high_bytes, low_bytes: NATURAL_32
		do
			code := c.natural_32_code
			if code > 0xFFFF then
				create Result.make (1, 4)
					-- Remove the 0x10000 offset, leaving a 20 bit value.
				code := code - 0x10000
					-- Upper 10 bits -> high surrogate (U+D800 .. U+DBFF).
				high_bytes := code.bit_shift_right (10).bit_and (0x3FF).bit_or (high_surrogate)
					-- Lower 10 bits -> low surrogate (U+DC00 .. U+DFFF).
				low_bytes := code.bit_and (0x3FF).bit_or (low_surrogate)
					-- `as_natural_8' truncates to the low 8 bits; `to_natural_8'
					-- must not be used here because the surrogate values do
					-- not fit into 8 bits.
				Result.put (high_bytes.as_natural_8, 1)
				Result.put (high_bytes.bit_shift_right (8).as_natural_8, 2)
				Result.put (low_bytes.as_natural_8, 3)
				Result.put (low_bytes.bit_shift_right (8).as_natural_8, 4)
			else
				create Result.make (1, 2)
					-- Low-order byte first, high-order byte second.
				Result.put (code.as_natural_8, 1)
				Result.put (code.bit_shift_right (8).as_natural_8, 2)
			end
		ensure then
			is_valid_encoded: is_valid_encoded_array_as_character (Result)
		end

feature -- character helpfeatures

	high_surrogate: NATURAL_32 is 0xD800
			-- Marker bits of a high (leading) surrogate code unit

	low_surrogate: NATURAL_32 is 0xDC00
			-- Marker bits of a low (trailing) surrogate code unit

feature -- string converter

	string_32_to_encoded (s: STRING_32): ARRAY [NATURAL_8] is
			-- UTF-16LE bytes of all characters of `s', in order.
			-- The result is indexed from 1 and holds exactly the
			-- encoded bytes.
		local
			i, j, my_counter: INTEGER
			utf16_char: ARRAY [NATURAL_8]
		do
				-- Every character yields at least 2 bytes, so `2 * s.count'
				-- never exceeds the final size; `force' extends the array
				-- when a character needs 4 bytes.
			create Result.make (1, 2 * s.count)
			from
				i := 1
			until
				i > s.count
			loop
				utf16_char := wide_character_to_encoded (s.item (i))
				from
					j := utf16_char.lower
				until
					j > utf16_char.upper
				loop
						-- `my_counter' is the last position written in `Result'.
					my_counter := my_counter + 1
					Result.force (utf16_char.item (j), my_counter)
					j := j + 1
				end
				i := i + 1
			end
		end

	encoded_to_string_32 (b: ARRAY [NATURAL_8]): STRING_32 is
			-- String decoded from the UTF-16LE byte array `b'.
			-- `b' is read in 2 byte code units; a unit whose high-order
			-- byte marks a high surrogate is decoded together with the
			-- following unit.
		local
			i: INTEGER
			high_bytes, low_bytes, new_code: NATURAL_32
		do
				-- One character per code unit is an upper bound for the length.
			create Result.make (b.count // 2)
			from
				i := b.lower
			until
				i > b.upper
			loop
				if b.item (i + 1).bit_and (0xFC) = 0xD8 then
						-- High-order byte of a high surrogate: the character
						-- occupies two code units (4 bytes).
						-- 0..0 | 1101 10ZZ ZZxx xxxx
					high_bytes := b.item (i + 1).to_natural_32.bit_shift_left (8) + b.item (i).to_natural_32
						-- 0..0 | 1101 11yy yyyy yyyy
					low_bytes := b.item (i + 3).to_natural_32.bit_shift_left (8) + b.item (i + 2).to_natural_32
					high_bytes := high_bytes.bit_and (0x3FF)
					low_bytes := low_bytes.bit_and (0x3FF)
						-- 0..0 0000 ZZZZ | xxxx xxyy yyyy yyyy
					new_code := high_bytes.bit_shift_left (10) + low_bytes
						-- 0..0 000z zzzz | xxxx xxyy yyyy yyyy, with zzzzz = ZZZZ + 1
					new_code := new_code + 0x10000
					Result.append_character (new_code.to_character_32)
					i := i + 4
				else
						-- Single code unit (2 bytes).
					new_code := b.item (i + 1).to_natural_32.bit_shift_left (8) + b.item (i).to_natural_32
					Result.append_character (new_code.to_character_32)
					i := i + 2
				end
			end
		end

feature -- check states

	is_valid_encoded_array_as_character (a: ARRAY [NATURAL_8]): BOOLEAN is
			-- Does `a' hold exactly one UTF-16LE encoded character?
			-- True for 2 bytes forming a non-surrogate code unit, or
			-- for 4 bytes forming a high surrogate followed by a low
			-- surrogate; False for any other count.
		local
			code: NATURAL_16
		do
			if a.count = 2 then
				code := (a.item (a.lower + 1).to_natural_16.bit_shift_left (8) + a.item (a.lower).to_natural_16)
					-- U+D800 .. U+DFFF is reserved for surrogates and is
					-- not a valid stand-alone code unit.
				Result := code < 0xD800 or code > 0xDFFF
			elseif a.count = 4 then
					-- a1|a2: ZZxx xxxx | 1101 10ZZ (high surrogate, low byte first)
					-- a3|a4: yyyy yyyy | 1101 11yy (low surrogate, low byte first)
				Result := (a.item (a.lower + 1).bit_and (0xFC) = 0xD8) and (a.item (a.lower + 3).bit_and (0xFC) = 0xDC)
			else
				Result := False
			end
		end

	is_valid_encoded_array_as_string (a: ARRAY [NATURAL_8]): BOOLEAN is
			-- Is `a' a well-formed UTF-16LE byte sequence?
			-- False for a Void array, an odd number of bytes, a high
			-- surrogate not followed by a low surrogate, or a low
			-- surrogate without a preceding high surrogate.
			-- An empty array is valid.
		local
			i: INTEGER
			code: NATURAL_32
		do
				-- Bounds are checked explicitly so that the result does not
				-- depend on array preconditions being monitored at run time.
			if a /= Void and then a.count \\ 2 = 0 then
				from
					i := a.lower
					Result := True
				until
					i > a.upper or not Result
				loop
						-- The count is even and `i' advances by 2 or 4 from
						-- `a.lower', so `i + 1' is always within bounds here.
					code := a.item (i + 1).to_natural_32.bit_shift_left (8) + a.item (i).to_natural_32
					if code.bit_and (0xFC00) = high_surrogate then
							-- Two code units: the second must be a low surrogate.
						if i + 3 <= a.upper then
							Result := a.item (i + 3).bit_and (0xFC) = 0xDC
						else
								-- Truncated surrogate pair.
							Result := False
						end
						i := i + 4
					else
							-- One code unit: must not be a lone low surrogate.
						Result := code.bit_and (0xFC00) /= low_surrogate
						i := i + 2
					end
				end
			end
		end

	is_valid_char_to_encode (c: WIDE_CHARACTER): BOOLEAN is
			-- Can `c' be encoded as UTF-16?
			-- True for codes up to U+10FFFF, except the surrogate
			-- range U+D800 .. U+DFFF.
		local
			code: NATURAL_32
		do
			code := c.natural_32_code
				-- A surrogate pair carries 20 bits on top of the 0x10000
				-- offset, so the highest encodable code is 0x10FFFF.
			Result := code <= 0x10FFFF and (code < high_surrogate or code > 0xDFFF)
		end

	is_valid_string_to_encode (s: STRING_32): BOOLEAN is
			-- Is every character of `s' valid to encode
			-- according to `is_valid_char_to_encode'?
		local
			i: INTEGER
		do
			Result := True
			if s /= Void then
				from
					i := 1
				until
					i > s.count or not Result
				loop
					Result := is_valid_char_to_encode (s.item (i))
					i := i + 1
				end
			end
		end

end