note
	description: "[
		Scanner (tokenizer) for a JSON encoded string.
		Splits the input into punctuation, string, number and boolean tokens and
		decodes primitive values as it goes. JSON `null' values are not handled:
		any identifier other than `true' or `false' is recorded as an error.
	]"
	author: "B. Herlig, B. Schoeller"
	date: "$Date$"
	revision: "$Revision$"

class
	A_JSON_TOKENIZER

create
	make

feature -- Initialization

	make (an_input: STRING)
			-- Create the tokenizer on the JSON encoded string `an_input'.
			-- The cursor is placed on the first character; no token is read yet.
		require
			input_specified: an_input /= Void
		do
			input := an_input
			char_pos := 1
			has_errors := False
			error_position := 0
			set_current_char
		ensure
			input_set: input /= Void and then input = an_input
			char_pos_initialized: char_pos = 1
			error_flag_initialized: not has_errors
			error_position_initialized: error_position = 0
		end

feature -- Implementation (Parser)

	expect_token (a_token: INTEGER)
			-- Expect `a_token' as current token, otherwise report an error.
			-- The next token is read in either case.
		do
			if last_token /= a_token then
				record_error
			end
			read_token
		ensure
			wrong_expected_token_is_error: (old last_token /= a_token) implies has_errors
		end

feature -- Access

	last_token: INTEGER
			-- Kind of the last token read (one of the `token_*' constants)

	has_errors: BOOLEAN
			-- Has an error been recorded since creation?

	error_position: INTEGER
			-- Value of `char_pos' when the first error was recorded;
			-- 0 as long as no error has been recorded

	last_double: DOUBLE
			-- Value of the last successfully decoded `token_double'

	last_integer: INTEGER_64
			-- Value of the last successfully decoded `token_integer'

	last_string: STRING
			-- Decoded content of the last `token_string';
			-- not assigned before the first call to `read_string'

	last_boolean: BOOLEAN
			-- Value of the last `true' / `false' keyword read

	last_identifier: STRING
			-- Text of the last identifier read. As `null' is not handled,
			-- anything other than `true' or `false' is recorded as an error.
			-- Not assigned before the first call to `read_identifier'.

feature -- Error handling

	record_error
			-- Record a parsing error.
			-- Only the first error sets `error_position'; later calls leave it untouched.
		do
			if not has_errors then
				has_errors := True
				error_position := char_pos
			end
		ensure
			error_set: has_errors
			first_error_position_recorded: (not old has_errors) implies error_position = char_pos
			earlier_error_position_kept: (old has_errors) implies error_position = old error_position
		end

feature -- Implementation (Scanner: Character features)

	input: STRING
			-- Input operating on

	char_pos: INTEGER
			-- Current reading cursor position (1-based character index into `input')

	current_char: CHARACTER
			-- Character at `char_pos', or '%U' once the cursor is past the end of `input'

	next_char
			-- Move the cursor to the next character and refresh `current_char'.
		do
			char_pos := char_pos + 1
			set_current_char
		ensure
			moved_right: char_pos = old char_pos + 1
		end

	read_token
			-- Read a token and indicate its type in `last_token'.
			-- Leading white space is skipped. A '%U' character is treated as end of
			-- input (this includes a NUL embedded in `input'). Any character that
			-- cannot start a token yields `token_unknown' and records an error.
		do
			skip_white_space

			inspect current_char
			when '%U' then
				last_token := token_eof
				next_char
			when '{' then
				last_token := token_open_curly_brace
				next_char
			when '}' then
				last_token := token_closed_curly_brace
				next_char
			when '[' then
				last_token := token_open_bracket
				next_char
			when ']' then
				last_token := token_closed_bracket
				next_char
			when ',' then
				last_token := token_comma
				next_char
			when ':' then
				last_token := token_colon
				next_char
			when '.' then
				last_token := token_dot
				next_char
			when '"' then
				read_string
			else
				if current_char.is_alpha then
					read_identifier
				elseif current_char.is_digit or current_char = '-' then
					read_number
				else
					last_token := token_unknown
					record_error
					next_char
				end
			end
		end

	skip_white_space
			-- Advance the cursor past any white space.
		do
			from
			until
				char_pos > input.count or else not current_char.is_space
			loop
				next_char
			end
		ensure
			at_EOF_or_not_on_whitespace: char_pos > input.count or else not current_char.is_space
		end

	read_identifier
			-- Read a run of alphabetic characters into `last_identifier'.
			-- `true' and `false' set `last_boolean'; any other identifier
			-- (including `null') is recorded as an error.
		do
			last_token := token_identifier
			last_identifier := ""

			from
			until
				char_pos > input.count or else not current_char.is_alpha
			loop
				last_identifier.extend (current_char)
				next_char
			end

			if last_identifier.is_equal ("true") then
				last_boolean := True
			elseif last_identifier.is_equal ("false") then
				last_boolean := False
			else
					-- Unknown identifier read
				record_error
			end
		end

	read_number
			-- Read a number and make it available in `last_integer' or `last_double'.
			-- `last_token' is `token_double' if the text contains '.', 'e' or 'E',
			-- `token_integer' otherwise. Text that is not a well-formed number
			-- (e.g. "1-2", a lone "-", or an integer outside the INTEGER_64 range)
			-- records an error and leaves the previous value untouched.
		local
			start_pos: INTEGER
			has_fraction_or_exponent: BOOLEAN
			literal: STRING
		do
			start_pos := char_pos
			if current_char = '-' then
				next_char
			end

				-- Greedily collect every character that may appear in a number;
				-- the collected text is validated as a whole afterwards.
			from
			until
				char_pos > input.count or else not
					(current_char.is_digit or current_char = '.' or
					current_char = 'e' or current_char = 'E' or
					current_char = '-' or current_char = '+')
			loop
				if current_char = '.' or current_char = 'e' or current_char = 'E' then
					has_fraction_or_exponent := True
				end
				next_char
			end

			literal := input.substring (start_pos, char_pos - 1)
			if has_fraction_or_exponent then
				last_token := token_double
				if literal.is_double then
					last_double := literal.to_double
				else
					record_error
				end
			else
				last_token := token_integer
				if literal.is_integer_64 then
					last_integer := literal.to_integer_64
				else
					record_error
				end
			end
		end

	read_string
			-- Read a string and make it available in `last_string'.
			-- The cursor is expected to be on the opening quote. Escape sequences
			-- are decoded; an unknown escape `\x' is kept as the character `x'.
			-- Scanning stops at the closing quote, at the end of input, or as soon
			-- as `has_errors' is set. A missing closing quote is recorded as an error.
		do
			last_token := token_string
			last_string := ""
				-- Skip opening quote symbol
			next_char

			from
			until
				current_char = '%"' or
				has_errors or
				char_pos > input.count
			loop
				if current_char = '\' then
					next_char
					if char_pos > input.count then
							-- Backslash was the last character of the input: nothing to
							-- decode; the missing closing quote is reported after the loop.
					else
						inspect current_char
						when '%"', '\', '/' then
								-- These characters stand for themselves.
							last_string.extend (current_char)
						when 'b' then
							last_string.extend ('%B')
						when 'f' then
							last_string.extend ('%F')
						when 'n' then
							last_string.extend ('%N')
						when 'r' then
							last_string.extend ('%R')
						when 't' then
							last_string.extend ('%T')
						when 'u' then
							read_unicode_escape
						else
								-- Unknown escape: keep the escaped character as is.
							last_string.extend (current_char)
						end
					end
				else
					last_string.extend (current_char)
				end
				next_char
			end

			if current_char = '%"' then
					-- Skip closing quote symbol
				next_char
			else
					-- End of input reached (or scanning aborted) without a closing quote.
					-- `record_error' has no effect if an error was already recorded.
				record_error
			end
		end


feature -- Implementation (Scanner: Tokens)

	frozen token_eof: INTEGER = 0					-- %U
	frozen token_identifier: INTEGER = 1
	frozen token_integer: INTEGER = 2
	frozen token_double: INTEGER = 3
	frozen token_string: INTEGER = 4
	frozen token_boolean: INTEGER = 5				-- Not assigned to `last_token' by any feature of this class
	frozen token_open_curly_brace: INTEGER = 6		-- {
	frozen token_closed_curly_brace: INTEGER = 7	-- }
	frozen token_open_bracket: INTEGER = 8			-- [
	frozen token_closed_bracket: INTEGER = 9		-- ]
	frozen token_comma: INTEGER = 10				-- ,
	frozen token_colon: INTEGER = 11				-- :
	frozen token_dot: INTEGER = 12					-- .
	frozen token_quote: INTEGER = 13				-- " (not assigned to `last_token' by any feature of this class)
	frozen token_unknown: INTEGER = -1				-- Character that cannot start a token


feature {NONE} -- Implementation

	string_routines: KL_STRING_ROUTINES
			-- Access to Gobo's string routines,
			-- used for checking & converting hex-strings
		once
			create Result
		ensure
			created: Result /= Void
		end

	set_current_char
			-- Update `current_char' from `char_pos';
			-- '%U' is used once the cursor is past the end of `input'.
		do
			if char_pos > input.count then
				current_char := '%U'
			else
				current_char := input.item (char_pos)
			end
		end

	read_unicode_escape
			-- Read the four characters following the `u' of a `\u' escape and, if
			-- they form a hexadecimal number, append the UTF-8 encoding of that code
			-- point to `last_string'; otherwise record an error.
			-- The cursor is left on the last of the four characters.
			-- Each escape is decoded on its own: two consecutive `\u' escapes are
			-- not combined into a single code point.
		require
			on_escape_marker: current_char = 'u'
			string_available: last_string /= Void
		local
			code_point: STRING
			code: INTEGER
			utf8_str: UC_UTF8_STRING
			i: INTEGER
		do
			create code_point.make (4)
			from
				i := 1
			until
				i > 4
			loop
					-- Past the end of input `current_char' is '%U', which makes
					-- the hexadecimal check below fail.
				next_char
				code_point.append_character (current_char)
				i := i + 1
			end

			if string_routines.is_hexadecimal (code_point) then
					-- Convert to integer, and create a UTF-8 string from this code point
				code := string_routines.hexadecimal_to_integer (code_point)
				create utf8_str.make_empty
				utf8_str.append_unicode_character (create {UC_CHARACTER}.make_from_code (code))
				last_string.append (utf8_str.to_utf8)
			else
				record_error
			end
		end

invariant
	input_specified: input /= Void
	character_position_positive: char_pos > 0
	error_implies_error_position_set: has_errors implies error_position /= 0
end