note
	description: "[
		Encoding conversion implementation on Unix.
		The cache is never freed in the library. It relies on the normal termination of the client process.
	]"
	legal: "See notice at end of class."
	status: "See notice at end of class."
	date: "$Date$"
	revision: "$Revision$"

class
	ENCODING_IMP

inherit
	ENCODING_I

	CODE_SETS
		export
			{NONE} all
		end

	EXCEPTION_MANAGER_FACTORY
		export
			{NONE} all
		end

feature -- String encoding conversion

	convert_to (a_from_code_page: STRING; a_from_string: READABLE_STRING_GENERAL; a_to_code_page: STRING)
			-- Convert `a_from_string' of `a_from_code_page' to a string of `a_to_code_page'.
			-- On success `last_conversion_successful' is True and the result is available
			-- in `last_converted_string'. On failure (error code reported by `iconv_imp',
			-- or a rescued CONVERSION_FAILURE) `last_conversion_successful' is False.
		local
			l_managed_pointer: MANAGED_POINTER
			l_count: INTEGER
			l_pointer: POINTER
			l_out_count: INTEGER
			l_string_32: STRING_32
			l_big_endian: BOOLEAN
			l_error: INTEGER
			l_retried: BOOLEAN
			l_converted: READABLE_STRING_GENERAL
			l_exception: detachable EXCEPTION
		do
			if not l_retried then
				l_big_endian := is_big_endian_code_page (a_from_code_page) or else
					(not is_little_endian and not is_little_endian_code_page (a_from_code_page))

					-- Lay out the input in memory according to the width of the source code page.
					-- A byte order mark is prepended only while the pair of code pages has not
					-- yet been recorded as converted in `descriptor_cache'.
				if is_four_byte_code_page (a_from_code_page) then
					l_string_32 := a_from_string.as_string_32.twin
					if not descriptor_cache.converted (a_from_code_page, a_to_code_page) then
						if (l_big_endian xor is_little_endian) then
							l_string_32.precede (byte_order_mark)
						else
							l_string_32.precede (byte_order_mark_32_reverse)
						end
					end
					l_managed_pointer := string_32_to_pointer (l_string_32)
					l_count := (l_string_32.count) * 4
				elseif is_two_byte_code_page (a_from_code_page) then
					l_string_32 := a_from_string.as_string_32.twin
					if not descriptor_cache.converted (a_from_code_page, a_to_code_page) then
						if (l_big_endian xor is_little_endian) then
							l_string_32.precede (byte_order_mark)
						else
							l_string_32.precede (byte_order_mark_16_reverse)
						end
					end
					l_managed_pointer := wide_string_to_pointer (l_string_32)
					l_count := (l_string_32.count) * 2
				else
					l_managed_pointer := multi_byte_to_pointer (a_from_string.as_string_8)
					l_count := a_from_string.count
				end

				l_pointer := iconv_imp (a_from_code_page, a_to_code_page, l_managed_pointer.item, l_count, $l_out_count, $l_error)

				if l_error = 0 and l_pointer /= default_pointer then
					last_conversion_successful := True
					if is_four_byte_code_page (a_to_code_page) then
						l_string_32 := pointer_to_string_32 (l_pointer, l_out_count)
						if not l_string_32.is_empty then
								-- Strip a leading byte order mark, and swap the byte order
								-- when the mark and the target code page call for it.
							if same_endian (l_string_32.code (1)) then
								l_string_32 := l_string_32.substring (2, l_string_32.count)
								if
									(is_big_endian_code_page (a_to_code_page) and is_little_endian) or else
									(is_little_endian_code_page (a_to_code_page) and not is_little_endian)
								then
									l_string_32 := string_32_switch_endian (l_string_32)
								end
							elseif reverse_endian (l_string_32.code (1)) then
								l_string_32 := l_string_32.substring (2, l_string_32.count)
								if
									(is_little_endian_code_page (a_to_code_page) and is_little_endian) or else
									(is_big_endian_code_page (a_to_code_page) and not is_little_endian) or else
									not is_endianness_specified (a_to_code_page)
								then
									l_string_32 := string_32_switch_endian (l_string_32)
								end
							end
						end
						l_converted := l_string_32
					elseif is_two_byte_code_page (a_to_code_page) then
						l_string_32 := pointer_to_wide_string (l_pointer, l_out_count)
						if not l_string_32.is_empty then
								-- Same byte order mark handling as above, for 16-bit units.
							if same_endian (l_string_32.code (1)) then
								l_string_32 := l_string_32.substring (2, l_string_32.count)
								if
									(is_big_endian_code_page (a_to_code_page) and is_little_endian) or else
									(is_little_endian_code_page (a_to_code_page) and not is_little_endian)
								then
									l_string_32 := string_16_switch_endian (l_string_32)
								end
							elseif reverse_endian (l_string_32.code (1)) then
								l_string_32 := l_string_32.substring (2, l_string_32.count)
								if
									(is_little_endian_code_page (a_to_code_page) and is_little_endian) or else
									(is_big_endian_code_page (a_to_code_page) and not is_little_endian) or else
									not is_endianness_specified (a_to_code_page)
								then
									l_string_32 := string_16_switch_endian (l_string_32)
								end
							end
						end
						last_was_wide_string := True
						l_converted := l_string_32
					else
						l_converted := pointer_to_multi_byte (l_pointer, l_out_count)
					end
					last_converted_string := l_converted
				else
					last_conversion_successful := False
				end

					-- Even in the case of an error `l_pointer' might not be NULL.
				if l_pointer /= default_pointer then
					l_pointer.memory_free
						-- Forget the freed block so that the rescue clause cannot free it again.
					l_pointer := default_pointer
				end
			else
					-- Reached after `retry': the conversion was aborted by an exception,
					-- so do not leave a stale success status behind.
				last_conversion_successful := False
			end
		rescue
			l_retried := True
			if l_pointer /= default_pointer then
				l_pointer.memory_free
				l_pointer := default_pointer
			end
			l_exception := exception_manager.last_exception
			if l_exception /= Void and then attached {CONVERSION_FAILURE} l_exception.original then
					-- In the future, a proper mechanism should be worked out
					-- to reflect such internal errors. For now the rescue
					-- is mostly for debugging.
				retry
			end
		end

feature -- Status report

	is_code_page_valid (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' valid?
			-- This matters little on Unix; what we are really interested in
			-- is `is_code_page_convertable'.
		do
			if a_code_page /= Void and then not a_code_page.is_empty then
				Result := is_known_code_page (a_code_page.as_lower)
			end
		end

	is_code_page_convertable (a_from_code_page, a_to_code_page: STRING_8): BOOLEAN
			-- Is `a_from_code_page' convertable to `a_to_code_page'?
			-- False when any exception occurs while checking.
		local
			l_error: INTEGER
			l_retried: BOOLEAN
		do
			if not l_retried then
				Result := is_codeset_convertable (a_from_code_page, a_to_code_page, $l_error)
				if l_error /= 0 then
					conversion_exception (l_error).raise
				end
			end
		rescue
				-- In the future, a proper mechanism should be worked out
				-- to reflect such internal errors. For now the rescue
				-- is mostly for debugging.
			Result := False
			l_retried := True
			retry
		end

	last_conversion_lost_data: BOOLEAN
			-- Did last conversion lose data?
			-- | When data is lost, an internal exception is raised
			-- | and handled. The conversion simply fails.
		do
			Result := not last_conversion_successful
		end

feature {NONE} -- Status report

	is_known_code_page (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' a known code page?
			-- UTF-8 is always considered known; any other code page is known
			-- if `c_codeset_valid' accepts it. False when any exception occurs.
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		local
			l_error: INTEGER
			l_retried: BOOLEAN
		do
			if not l_retried then
				if not a_code_page.is_case_insensitive_equal ({CODE_PAGE_CONSTANTS}.utf8) then
					Result := c_codeset_valid (a_code_page, $l_error)
					if l_error /= 0 then
						conversion_exception (l_error).raise
					end
				else
					Result := True
				end
			end
		rescue
				-- In the future, a proper mechanism should be worked out
				-- to reflect such internal errors. For now the rescue
				-- is mostly for debugging.
			Result := False
			l_retried := True
			retry
		end

	is_two_byte_code_page (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' (compared in lower case) listed in `two_byte_code_pages'?
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		do
			Result := two_byte_code_pages.has (a_code_page.as_lower)
		end

	is_four_byte_code_page (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' (compared in lower case) listed in `four_byte_code_pages'?
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		do
			Result := four_byte_code_pages.has (a_code_page.as_lower)
		end

	is_big_endian_code_page (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' (compared in lower case) listed in `big_endian_code_pages'?
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		do
			Result := big_endian_code_pages.has (a_code_page.as_lower)
		end

	is_little_endian_code_page (a_code_page: STRING): BOOLEAN
			-- Is `a_code_page' (compared in lower case) listed in `little_endian_code_pages'?
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		do
			Result := little_endian_code_pages.has (a_code_page.as_lower)
		end

	is_endianness_specified (a_code_page: STRING): BOOLEAN
			-- Is the endianness of `a_code_page' specified?
		require
			a_code_page_not_void: a_code_page /= Void
			a_code_page_not_empty: not a_code_page.is_empty
		do
			Result := is_big_endian_code_page (a_code_page) or else is_little_endian_code_page (a_code_page)
		end

feature {NONE} -- Cache

	descriptor_cache: DESCRIPTOR_CACHE
			-- Cache of conversion descriptors, keyed by the concatenation of
			-- the source and target code page names.
		once
			create Result.make
		end

feature {NONE} -- Implementation

	iconv_imp (a_from_code_page, a_to_code_page: STRING; a_str: POINTER; a_size: INTEGER; a_out_count, a_error_code: TYPED_POINTER [INTEGER]): POINTER
			-- `iconv' plus setup and caching.
			-- Convert the `a_size' bytes at `a_str' using the cached descriptor for the pair
			-- of code pages; see `c_iconv' for the meaning of the result, `a_out_count'
			-- and `a_error_code'.
		require
			a_from_code_page_valid: is_code_page_valid (a_from_code_page)
			a_to_code_page_valid: is_code_page_valid (a_to_code_page)
			code_page_convertable: is_code_page_convertable (a_from_code_page, a_to_code_page)
		local
			l_key: STRING
			l_cd: POINTER
		do
			l_key := a_from_code_page + a_to_code_page
			descriptor_cache.search (l_key)
				-- The descriptor was cached by `is_codeset_convertable',
				-- called through the precondition `code_page_convertable'.
			check found: descriptor_cache.found end
			l_cd := descriptor_cache.found_item
			Result := c_iconv (l_cd, a_str, a_size, a_out_count, a_error_code)
			descriptor_cache.record_converted_pair (a_from_code_page, a_to_code_page)
		end

	is_codeset_convertable (a_from_code_page, a_to_code_page: STRING; a_error: TYPED_POINTER [INTEGER]): BOOLEAN
			-- Are `a_from_code_page' and `a_to_code_page' convertable?
			-- A descriptor opened successfully is stored in `descriptor_cache'.
			-- `a_error' is set by `c_iconv_open' when opening fails.
		local
			l_fp, l_tp: MANAGED_POINTER
			l_key: STRING
			l_cd: POINTER
			l_succ: BOOLEAN
		do
			l_key := a_from_code_page + a_to_code_page
			descriptor_cache.search (l_key)
			if descriptor_cache.found then
				Result := True
			else
				l_fp := multi_byte_to_pointer (a_from_code_page)
				l_tp := multi_byte_to_pointer (a_to_code_page)
				l_cd := c_iconv_open (l_fp.item, l_tp.item, a_error, $l_succ)
				if l_succ then
					descriptor_cache.put (l_cd, l_key)
					Result := True
				end
			end
		end

	c_codeset_valid (a_code_set: STRING; a_error: TYPED_POINTER [INTEGER]): BOOLEAN
			-- Check if `a_code_set' is convertible to utf-8 to see if it is valid.
			-- Some systems do not support utf-8 to utf-8 conversion, so checking utf-8
			-- should be avoided.
		do
			Result := is_codeset_convertable (a_code_set, "utf-8", a_error)
		end

	same_endian (code: NATURAL_32): BOOLEAN
			-- Is `code' a byte order mark read with the same endianness as the system?
		do
			Result := code = 0xFEFF
		end

	reverse_endian (code: NATURAL_32): BOOLEAN
			-- Is `code' a byte order mark read with the reverse endianness of the system
			-- (16-bit or 32-bit form)?
		do
			Result := code = 0xFFFE or code = 0xFFFE0000
		end

	string_32_to_pointer (a_string: STRING_32): MANAGED_POINTER
			-- Memory area holding the codes of `a_string' as consecutive 32-bit naturals,
			-- followed by a 32-bit zero terminator.
		require
			a_string_not_void: a_string /= Void
		local
			i, nb: INTEGER
		do
			nb := a_string.count
				-- One extra slot for the terminating zero.
			create Result.make ((nb + 1) * 4)
			from
				i := 0
			until
				i = nb
			loop
				Result.put_natural_32 (a_string.code (i + 1), i * 4)
				i := i + 1
			end
			Result.put_natural_32 (0, nb * 4)
		end

	byte_order_mark: CHARACTER_32
			-- Byte order mark (BOM)
		once
			Result := (0xFEFF).to_character_32
		end

	byte_order_mark_32_reverse: CHARACTER_32
			-- Byte-swapped byte order mark (BOM) for 32-bit code units
		once
			Result := (0xFFFE0000).to_character_32
		end

	byte_order_mark_16_reverse: CHARACTER_32
			-- Byte-swapped byte order mark (BOM) for 16-bit code units
		once
			Result := (0xFFFE).to_character_32
		end

	conversion_exception (a_error: INTEGER): CONVERSION_FAILURE
			-- Create exception by `a_error'.
			-- Codes 1 to 8 are the values set by `c_iconv_open' and `c_iconv';
			-- any other value yields a generic message.
		do
			inspect a_error
			when 1 then
				create Result.make_message ("`malloc' error")
			when 2 then
				create Result.make_message ("`realloc' error")
			when 3 then
				create Result.make_message ("`iconv_open' error")
			when 4 then
				create Result.make_message ("EILSEQ error in `iconv'. Input conversion stopped due to an input byte that does not belong to the input codeset.")
			when 5 then
				create Result.make_message ("EINVAL error in `iconv'. Input conversion stopped due to an incomplete character or shift sequence at the end of the input buffer.")
			when 6 then
				create Result.make_message ("EBADF error in `iconv'. The cd argument is not a valid open conversion descriptor.")
			when 7 then
				create Result.make_message ("Unexpected error in `iconv'")
			when 8 then
				create Result.make_message ("`iconv_close' error")
			else
				create Result.make_message ("Unexpected error")
			end
		ensure
			conversion_exception_not_void: Result /= Void
		end

	c_iconv_open (a_from_codeset, a_to_codeset: POINTER; a_b: TYPED_POINTER [INTEGER]; a_succ: TYPED_POINTER [BOOLEAN]): POINTER
			-- Open a conversion descriptor from `a_from_codeset' to `a_to_codeset'.
			-- On success `a_succ' is set to True and the descriptor is returned.
			-- On failure `a_b' is set to 3 (see `conversion_exception') and the
			-- result is the null pointer; `a_succ' is left untouched.
		external
			"C inline use <iconv.h>"
		alias
			"[
				iconv_t cd;
				cd = iconv_open ($a_to_codeset, $a_from_codeset);
				if (cd == (iconv_t)(-1)) {
					*$a_b = 3;
					return NULL;
				}
				*$a_succ = EIF_TRUE;
				return cd;
			]"
		end

	c_iconv (a_cd: POINTER; a_str: POINTER; a_size: INTEGER; a_out_count, a_error_code: TYPED_POINTER [INTEGER]): POINTER
			-- Convert the `a_size' bytes at `a_str' with descriptor `a_cd'.
			-- Result is a buffer obtained from `malloc' that the caller must free;
			-- `a_out_count' receives the number of bytes written to it.
			-- Result is the null pointer only when the initial allocation fails.
			-- Code `a_error_code' could be set when error occurs.
			-- See `conversion_exception' for the meaning.
		external
			"C inline use <iconv.h>, <errno.h>, <stdlib.h>"
		alias
			"[
				size_t insize = 0;
				iconv_t cd = (iconv_t) $a_cd;
				size_t nconv, avail, alloc;
				char *res, *tres, *wrptr, *inptr;
				char **l_inptr = &inptr;

				insize = (size_t)$a_size;
				alloc = avail = insize + insize/4;
				if (alloc == 0) {
						/* `malloc (0)' may legally return NULL, which would be
						 * misreported as an allocation failure for empty input. */
					alloc = avail = 1;
				}
				if (!(res = malloc(alloc))) {
					*$a_error_code = 1;
					return NULL;
				} else {
					*$a_error_code = 0;
					wrptr = res;		/* duplicate pointers because they */
					inptr = $a_str;		/* get modified by iconv */

						/* Reset the descriptor to initial state. */
					iconv (cd, NULL, 0, NULL, 0);
					do {
						nconv = iconv (cd, l_inptr, &insize, &wrptr, &avail);	/* conversions */
						if (nconv == (size_t)(-1)) {
							if (errno == E2BIG) {
									/* need more room for result */
								tres = realloc(res, alloc + 20);
								if (!tres) {
										/* `res' is still valid and is returned to the caller. */
									*$a_error_code = 2;
									break;
								}
								alloc += 20;
								avail += 20;
								wrptr = tres + (wrptr - res);
								res = tres;
							} else if (errno == EILSEQ) {
								*$a_error_code = 4;
								break;
							} else if (errno == EINVAL) {
								*$a_error_code = 5;
								break;
							} else if (errno == EBADF) {
								*$a_error_code = 6;
								break;
							} else {
								*$a_error_code = 7;
								break;
							}
						}
					} while (insize);
					*$a_out_count = alloc - avail;
					return res;
				}
			]"
		end

note
	library: "Encoding: Library of reusable components for Eiffel."
	copyright: "Copyright (c) 1984-2010, Eiffel Software and others"
	license: "Eiffel Forum License v2 (see http://www.eiffel.com/licensing/forum.txt)"
	source: "[
			Eiffel Software
			5949 Hollister Ave., Goleta, CA 93117 USA
			Telephone 805-685-1006, Fax 805-685-6869
			Website http://www.eiffel.com
			Customer support http://support.eiffel.com
		]"

end