note
	description: "[
					Interface of encoding converter with encoding detection.
					
					Encoding detection priority:
					1. If `a_encoding' is attached, use it as the encoding of `a_file'
					2. Detect BOM from `a_file', use the detected encoding.
					3. Use the encoding read from the class (see `encoding_from_class'), if any.
					4. Default to ASCII (ISO-8859-1) encoding.
				]"
	status: "See notice at end of class."
	legal: "See notice at end of class."
	date: "$Date$"
	revision: "$Revision$"

class
	ENCODING_CONVERTER

inherit
	ANY

	STRING_HANDLER
		export
			{NONE} all
		end

	SYSTEM_ENCODINGS
		export
			{NONE} all
		end

	LOCALIZED_PRINTER

	UNICODE_CONVERSION
		rename
			string_32_to_multi_byte as string_32_to_stream_encoding
		end

create
	make

feature {NONE} -- Initialization

	make
			-- Set up the converter with a pre-sized `string_buffer'.
		local
			l_capacity: INTEGER
		do
				-- Reserve room up front: `string_buffer' is wiped and refilled
				-- by `input_buffer_from_file' rather than recreated.
			l_capacity := 50000
			create string_buffer.make (l_capacity)
		end

feature -- Buffer

	input_buffer_from_file (a_file: KL_BINARY_INPUT_FILE; a_class: detachable ANY): detachable YY_UNICODE_BUFFER
			-- Input buffer over the content of `a_file', chosen as follows:
			--  * `detection_buffer' reports an encoding equal to UTF-8: the detection buffer itself;
			--  * `detection_buffer' reports another encoding: Void;
			--  * no encoding reported, but `encoding_from_class' yields one for `a_class':
			--    a buffer over the content converted to UTF-8 (Void if that conversion fails);
			--  * otherwise: `ascii_to_utf8_file_buffer' set up for ISO-8859-1.
			-- Updates `detected_encoding' and `last_bom' accordingly.
		require
			a_file_not_void: a_file /= Void
			a_file_open_read: a_file.is_open_read
		local
			l_detector: ENCODING_DETECTION_FILE_BUFFER
			l_fallback: like ascii_to_utf8_file_buffer
			l_content: STRING
		do
			last_bom := Void
			detected_encoding := Void
			l_detector := detection_buffer
			l_detector.set_file (a_file)
			l_detector.detect_file
			if attached l_detector.detected_encoding as l_found then
				detected_encoding := l_found
				last_bom := l_detector.last_bom
				if l_found.is_equal (utf8) then
					Result := l_detector
				end
					-- Any other detected encoding leaves `Result' Void
					-- (unknown encoding error, not reported yet).
			elseif attached a_class and then attached encoding_from_class (a_class) as l_class_encoding then
					-- Encoding comes from `a_class': gather the content in the
					-- reusable `string_buffer' and convert it in one go.
				l_content := string_buffer
				l_content.wipe_out
					-- Presumably the part of the file already read during detection,
					-- followed by whatever is left to read from `a_file'.
				l_content.append_string (l_detector.content.to_text)
				a_file.read_string (a_file.count)
				l_content.append (a_file.last_string)
				l_class_encoding.convert_to (utf8, l_content)
				if l_class_encoding.last_conversion_successful then
					create Result.make_from_utf8_string (l_class_encoding.last_converted_stream)
					detected_encoding := l_class_encoding
				end
					-- A failed conversion leaves `Result' and `detected_encoding' Void
					-- (unsupported encoding error, not reported yet).
			else
					-- Nothing detected and no class encoding: fall back to ASCII.
				l_fallback := ascii_to_utf8_file_buffer
				l_fallback.make_from_file_buffer (l_detector)
				l_fallback.set_default_encoding (l_fallback.iso_8859_1_encoding)
				detected_encoding := default_encoding
				Result := l_fallback
			end
		end

	input_buffer_from_file_of_encoding (a_file: KL_BINARY_INPUT_FILE; a_encoding: ENCODING): detachable YY_UNICODE_BUFFER
			-- Input buffer over the content of `a_file', taken as `a_encoding'
			-- and converted to UTF-8; Void when that conversion is not successful.
			-- Sets `detected_encoding' to `a_encoding' and clears `last_bom'.
		require
			a_file_not_void: a_file /= Void
			a_file_open_read: a_file.is_open_read
			a_encoding_not_void: a_encoding /= Void
		local
			l_raw: STRING
		do
			a_file.read_string (a_file.count)
			l_raw := a_file.last_string
			a_encoding.convert_to (utf8, l_raw)
			if a_encoding.last_conversion_successful then
				create Result.make_from_utf8_string (a_encoding.last_converted_stream)
			end
				-- On failure `Result' stays Void (unsupported encoding error, not reported yet).
			last_bom := Void
			detected_encoding := a_encoding
		end

	input_buffer_from_ascii_string (a_string: STRING_8): YY_UNICODE_BUFFER
			-- New input buffer over `a_string', built with `make_from_iso_8859_1_string'.
			-- Records `default_encoding' as `detected_encoding' and clears `last_bom'.
		require
			a_string_not_void: a_string /= Void
		do
			create Result.make_from_iso_8859_1_string (a_string)
				-- No BOM is looked for on this path.
			last_bom := Void
			detected_encoding := default_encoding
		ensure
			buffer_attached: Result /= Void
			detected_encoding_attached: detected_encoding /= Void
		end

	input_buffer_from_string_of_encoding (a_string: STRING_8; a_encoding: ENCODING): detachable YY_UNICODE_BUFFER
			-- Input buffer over `a_string', taken as `a_encoding' and converted to UTF-8.
			-- Void when the conversion is not successful.
			-- Sets `detected_encoding' to `a_encoding' and clears `last_bom'.
		require
			a_string_not_void: a_string /= Void
			a_encoding_not_void: a_encoding /= Void
		do
			a_encoding.convert_to (utf8, a_string)
			if a_encoding.last_conversion_successful then
				create Result.make_from_utf8_string (a_encoding.last_converted_stream)
			else
					-- Report unsupported encoding error.
			end
				-- The encoding is recorded even when the conversion failed.
			detected_encoding := a_encoding
			last_bom := Void
		end

	input_buffer_from_string (a_string: STRING_8; a_class: detachable ANY): detachable YY_UNICODE_BUFFER
			-- Input buffer over `a_string' converted to UTF-8; Void if the conversion fails.
			-- The source encoding is the one reported by `bom_detector' if any,
			-- else the one given by `encoding_from_class' for `a_class' if any,
			-- else ISO-8859-1. A non-empty BOM is removed before converting.
			-- Sets `detected_encoding' and `last_bom' accordingly.
		require
			a_string_not_void: a_string /= Void
		local
			l_source_encoding: ENCODING
			l_payload: STRING
		do
			detected_encoding := Void
			last_bom := Void
			bom_detector.detect (a_string)
			if attached bom_detector.detected_encoding as l_from_bom then
				last_bom := bom_detector.last_bom
				l_source_encoding := l_from_bom
			elseif attached a_class and then attached encoding_from_class (a_class) as l_from_class then
				l_source_encoding := l_from_class
			else
				l_source_encoding := iso_8859_1
			end
				-- Convert only what follows the BOM, when there is one.
			if attached last_bom as l_bom and then not l_bom.is_empty then
				l_payload := a_string.substring (l_bom.count + 1, a_string.count)
			else
				l_payload := a_string
			end
			l_source_encoding.convert_to (utf8, l_payload)
			if l_source_encoding.last_conversion_successful then
				create Result.make_from_utf8_string (l_source_encoding.last_converted_stream)
			end
				-- The encoding is recorded whether or not the conversion succeeded.
			detected_encoding := l_source_encoding
		end

	default_encoding: ENCODING
			-- Encoding recorded in `detected_encoding' when nothing else
			-- could be determined: ISO-8859-1.
		do
			Result := iso_8859_1
		ensure
			Result_set: Result /= Void
		end

feature -- Conversion (based on a class)

	utf8_string (a_stream: STRING; a_class: detachable ANY): STRING
			-- `a_stream' as UTF-8, after detecting its encoding: the one reported
			-- by `bom_detector' if any, else the one given by `encoding_from_class'
			-- for `a_class' if any, else ISO-8859-1.
			-- When a conversion is not successful, `a_stream' itself is returned.
			-- Detection is not 100% reliable. Use other conversion methods when
			-- encodings are known.
			-- Sets `detected_encoding' and `last_bom' accordingly.
		require
			a_stream_attached: a_stream /= Void
		do
			bom_detector.detect (a_stream)
			last_bom := Void
			if bom_detector.last_detection_successful then
					-- Only the BOM is dropped here; the remaining bytes are returned unconverted.
					-- NOTE(review): this presumes the detected BOM designates UTF-8 --
					-- confirm which encodings `bom_detector' can report.
				Result := a_stream.substring (bom_detector.last_bom_count + 1, a_stream.count)
				detected_encoding := bom_detector.detected_encoding
				last_bom := bom_detector.last_bom
			elseif attached a_class and then attached encoding_from_class (a_class) as l_encoding then
				l_encoding.convert_to (utf8, a_stream)
				if l_encoding.last_conversion_successful then
					Result := l_encoding.last_converted_stream
				else
						-- Report unsupported encoding error.
					Result := a_stream
				end
				detected_encoding := l_encoding
			else
					-- Default to ASCII
				iso_8859_1.convert_to (utf8, a_stream)
				if iso_8859_1.last_conversion_successful then
					Result := iso_8859_1.last_converted_stream
				else
					Result := a_stream
				end
				detected_encoding := default_encoding
			end
		ensure
			utf8_string_attached: Result /= Void
			detected_encoding_attached: detected_encoding /= Void
		end

	utf32_string (a_stream: STRING; a_class: detachable ANY): STRING_32
			-- `a_stream' as UTF-32, after detecting its encoding: the one reported
			-- by `bom_detector' if any, else the one given by `encoding_from_class'
			-- for `a_class' if any, else ISO-8859-1.
			-- When no conversion succeeds, `a_stream.as_string_32' is returned.
			-- Sets `detected_encoding' and `last_bom' accordingly.
		require
			a_stream_attached: a_stream /= Void
		local
			l_without_bom: STRING
			l_latin_1: ENCODING
		do
			bom_detector.detect (a_stream)
			last_bom := Void
			if bom_detector.last_detection_successful then
					-- What follows the BOM is decoded as UTF-8.
				l_without_bom := a_stream.substring (bom_detector.last_bom_count + 1, a_stream.count)
				Result := utf8_to_utf32 (l_without_bom)
				detected_encoding := bom_detector.detected_encoding
				last_bom := bom_detector.last_bom
			elseif attached a_class and then attached encoding_from_class (a_class) as l_class_encoding then
				l_class_encoding.convert_to (utf32, a_stream)
				if l_class_encoding.last_conversion_successful then
					Result := l_class_encoding.last_converted_string.as_string_32
				else
						-- Direct UTF-32 conversion is not available on every OS:
						-- go through UTF-8 instead.
					l_class_encoding.convert_to (utf8, a_stream)
					if l_class_encoding.last_conversion_successful then
						Result := utf8_to_utf32 (l_class_encoding.last_converted_stream)
					else
							-- Unsupported encoding (error not reported yet).
						Result := a_stream.as_string_32
					end
				end
				detected_encoding := l_class_encoding
			else
					-- Nothing detected: treat the input as ASCII.
				l_latin_1 := iso_8859_1
				l_latin_1.convert_to (utf32, a_stream)
				if l_latin_1.last_conversion_successful then
					Result := l_latin_1.last_converted_string_32
				else
						-- Same detour through UTF-8 as above.
					l_latin_1.convert_to (utf8, a_stream)
					if l_latin_1.last_conversion_successful then
						Result := utf8_to_utf32 (l_latin_1.last_converted_stream)
					else
						Result := a_stream.as_string_32
					end
				end
				detected_encoding := l_latin_1
			end
		ensure
			utf32_string_attached: Result /= Void
			detected_encoding_attached: detected_encoding /= Void
		end

feature -- Conversion

	utf8_to_utf16 (a_string: STRING_8): STRING_32
			-- Conversion of UTF-8 `a_string', Eiffel implementation.
			-- |FIXME: A real UTF-8 to UTF-16 conversion is not implemented;
			-- |FIXME: the result of `utf8_to_utf32' is returned instead, which
			-- |FIXME: in most cases is the same as the UTF-16 form.
		require
			a_string_not_void: a_string /= Void
		do
			Result := utf8_to_utf32 (a_string)
		ensure
			Result_not_void: Result /= Void
		end

	utf32_to_file_encoding (a_str: STRING_32): STRING
			-- `a_str' converted from UTF-32 to the file encoding.
			-- This implementation always produces UTF-8, via `utf32_to_utf8'.
		require
			a_str_attached: a_str /= Void
		do
			Result := utf32_to_utf8 (a_str)
		ensure
			utf32_to_file_encoding_attached: Result /= Void
		end

	string_32_to_stream (a_str: STRING_32): STRING
			-- Byte stream of `a_str': four bytes per character,
			-- least significant byte first (always little endian).
			-- Empty when `a_str' is empty.
		require
			a_str_attached: a_str /= Void
		local
			l_index, l_shift: INTEGER
			l_code: NATURAL_32
			l_count: INTEGER
		do
			l_count := a_str.count
			create Result.make (l_count * 4)
			from
				l_index := 1
			until
				l_index > l_count
			loop
				l_code := a_str [l_index].natural_32_code
					-- Emit bits 0-7, 8-15, 16-23 and 24-31, in that order.
				from
					l_shift := 0
				until
					l_shift > 24
				loop
					Result.append_code ((l_code |>> l_shift) & 0x000000FF)
					l_shift := l_shift + 8
				end
				l_index := l_index + 1
			end
		ensure
			string_32_to_stream_attached: Result /= Void
		end

feature -- Validate

	is_code_point_valid_string_8 (a_utf8_str: STRING_8): BOOLEAN
			-- Does every character read from UTF-8 `a_utf8_str' satisfy `is_character_8'?
			-- True for an empty string. Scanning stops at the first character that does not.
		require
			a_string_not_void: a_utf8_str /= Void
		local
			l_position, l_last: INTEGER
			l_width: INTEGER_32_REF
		do
			create l_width
			l_last := a_utf8_str.count
			Result := True
			from
				l_position := 1
			until
				not Result or l_position > l_last
			loop
				Result := read_character_from_utf8 (l_position, l_width, a_utf8_str).is_character_8
					-- Presumably `read_character_from_utf8' stores in `l_width' the number
					-- of bytes it consumed; advance by that amount.
				l_position := l_position + l_width.item
			end
		end

feature -- Detection

	encoding_from_string_of_class (a_string: STRING_8; a_class: detachable ANY): detachable ENCODING
			-- Encoding to assume for `a_string': the one reported by `bom_detector'
			-- if detection succeeds, else the one given by `encoding_from_class'
			-- for `a_class' if any, else ISO-8859-1.
			-- Sets `last_bom' to the detected BOM, or Void if detection did not succeed.
		local
			l_detector: BOM_ENCODING_DETECTOR
		do
			last_bom := Void
			l_detector := bom_detector
			l_detector.detect (a_string)
			if l_detector.last_detection_successful then
				last_bom := l_detector.last_bom
				Result := l_detector.detected_encoding
			elseif attached a_class and then attached encoding_from_class (a_class) as l_class_encoding then
				Result := l_class_encoding
			else
				Result := iso_8859_1
			end
		end

	last_bom: detachable STRING_8
			-- BOM found by the most recent buffer, conversion or detection
			-- feature of this class (e.g. `encoding_from_string_of_class');
			-- Void when none was found.

	detected_encoding: detachable ENCODING assign set_detected_encoding
			-- Encoding determined by the most recent buffer or conversion feature
			-- of this class, or set explicitly through `set_detected_encoding'.
			-- May be Void, e.g. after a failed conversion in `input_buffer_from_file'.

feature -- Element Change

	set_detected_encoding (a_encoding: like detected_encoding)
			-- Set `detected_encoding' to `a_encoding'.
			-- Although the argument type is detachable (it follows `detected_encoding'),
			-- the precondition rejects Void.
		require
			a_encoding_not_void: a_encoding /= Void
		do
			detected_encoding := a_encoding
		ensure
			detected_encoding_set: detected_encoding = a_encoding
		end

feature {NONE} -- Implementation

	encoding_from_class (a_class: ANY): detachable ENCODING
			-- Encoding read from `a_class', if any.
			-- The body is empty here, so the result is always Void in this class.
			-- NOTE(review): presumably a hook for descendants to redefine -- confirm.
		do
		end

feature {NONE} -- Buffers

	string_buffer: STRING
			-- Reusable string buffer, created in `make'; wiped and refilled by
			-- `input_buffer_from_file' when converting with a class encoding.

	bom_detector: BOM_ENCODING_DETECTOR
			-- BOM detector, created on first use (once function).
			-- Its `detect' overwrites the previous detection results, so read
			-- them right after the call.
		once
			create Result
		end

	Ascii_to_utf8_File_buffer: ASCII_UTF8_CONVERSION_FILE_BUFFER
			-- On-the-fly ASCII to UTF-8 conversion buffer, created on first use.
			-- It starts on an empty string stream; `input_buffer_from_file'
			-- re-initializes it with `make_from_file_buffer' before use.
		local
			l_placeholder: KL_STRING_INPUT_STREAM
		once
			create l_placeholder.make ("")
			create Result.make_with_size (l_placeholder, 50000)
		ensure
			file_buffer_not_void: Result /= Void
		end

	detection_buffer: ENCODING_DETECTION_FILE_BUFFER
			-- Buffer used for encoding detection, created on first use.
			-- It starts on an empty string stream; `input_buffer_from_file'
			-- points it at the actual file with `set_file'.
		local
			l_placeholder: KL_STRING_INPUT_STREAM
		once
			create l_placeholder.make ("")
			create Result.make_with_size (l_placeholder, 50000)
		ensure
			file_buffer_not_void: Result /= Void
		end

invariant
		-- `string_buffer' is created in `make' and never reassigned in this class.
	string_buffer_not_void: string_buffer /= Void

note
	copyright: "Copyright (c) 1984-2020, Eiffel Software"
	license:   "GPL version 2 (see http://www.eiffel.com/licensing/gpl.txt)"
	licensing_options: "http://www.eiffel.com/licensing"
	copying: "[
			This file is part of Eiffel Software's Eiffel Development Environment.
			
			Eiffel Software's Eiffel Development Environment is free
			software; you can redistribute it and/or modify it under
			the terms of the GNU General Public License as published
			by the Free Software Foundation, version 2 of the License
			(available at the URL listed under "license" above).
			
			Eiffel Software's Eiffel Development Environment is
			distributed in the hope that it will be useful, but
			WITHOUT ANY WARRANTY; without even the implied warranty
			of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			See the GNU General Public License for more details.
			
			You should have received a copy of the GNU General Public
			License along with Eiffel Software's Eiffel Development
			Environment; if not, write to the Free Software Foundation,
			Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
		]"
	source: "[
			Eiffel Software
			5949 Hollister Ave., Goleta, CA 93117 USA
			Telephone 805-685-1006, Fax 805-685-6869
			Website http://www.eiffel.com
			Customer support http://support.eiffel.com
		]"

end