note

	description: "Lexical analyzers."
	legal: "See notice at end of class."
	status: "See notice at end of class."
	date: "$Date$"
	revision: "$Revision$"

class LEXICAL inherit

	TEXT_FILLER

create

	make, make_new

feature {NONE} -- Initialization

	make
			-- Set up lexical analyzer for retrieval.
		do
			create last_token
			create categories_table.make_empty
			create dfa.make (1, 0)
			create buffer.make_empty
			create line_nb_array.make_empty
			create column_nb_array.make_empty
			create keyword_h_table.make (0)
		end;

	make_new
			-- Set up a new lexical analyzer
		obsolete
			"Use make instead. [2017-05-31]"
		do
			make
			initialize
		end

feature -- Access

	last_token: TOKEN;
			-- Last token read

	token_line_number: INTEGER
			-- Line number of last token read
		do
			Result := line_nb_array.item (token_start)
		ensure
			Result >= 1
		end;

	token_column_number: INTEGER
			-- Column number of last token read
		do
			Result := column_nb_array.item (token_start)
		ensure
			Result >= 1
		end;

	last_string_read: STRING
			-- String value of last token read
		do
				-- Create a new string at each call
			Result := buffer.substring (token_start, token_end)
		end;

	keyword_code (word: STRING): INTEGER
			-- Keyword code for `word'.
			-- -1 if not a keyword.
		require
			word_not_void: word /= Void
		local
			l_lower_word: like lower_word
		do
			if keywords_case_sensitive then
				if keyword_h_table.has (word) then
					Result := word.hash_code
				else
					Result := -1
				end
			else
				l_lower_word := word.as_lower
				lower_word := l_lower_word
				if keyword_h_table.has (l_lower_word) then
					Result := l_lower_word.hash_code
				else
					Result := -1
				end
			end
		end;

	last_is_keyword: BOOLEAN
			-- Is the last read token a keyword?
		do
			Result := is_keyword (last_string_read)
		ensure
			Result = is_keyword (last_string_read)
		end;

	last_keyword_code: INTEGER
			-- Keyword code for last token.
			-- -1 if not a keyword.
		do
			Result := keyword_code (last_string_read)
		ensure
			-- Result = -1 or last_string_read is in keyword_h_table.
		end;

	last_keyword_text: detachable STRING
			-- Last read string if recognized as a keyword;
			-- void otherwise.
		do
			if last_is_keyword then
				Result := last_string_read
			end
		end

	keyword_string (n: INTEGER): STRING
			-- Keyword corresponding to keyword code `n'
		local
			finished: BOOLEAN
		do
			create Result.make_empty
			from
				keyword_h_table.start
			until
				finished or keyword_h_table.after
			loop
				finished := n = keyword_h_table.key_for_iteration.hash_code
				if finished then
					Result.append (keyword_h_table.key_for_iteration)
				end
				keyword_h_table.forth
			end
		ensure
			keyword_found: Result /= Void
		end;

	token_type: INTEGER;
			-- Type of last token read

	No_token: INTEGER = 0;
			-- Token type for no token recognized.

	other_possible_tokens: detachable ARRAY [INTEGER];
			-- Other candidate types for last recognized token

	end_of_text: BOOLEAN;
			-- Has end of input been reached?

feature -- Status setting

	set_separator_type (type : INTEGER)
			-- Set `type' to be the type of tokens
			-- used as separators.
		do
			separator_token_type := type
		ensure
			separator_token_type = type
		end;

feature -- Input

	get_token
			-- Read new token matching one of the regular
			-- expressions of the lexical grammar.
			-- Recognize longest possible string;
			-- ignore unrecognized tokens and separators.
		local
			found: BOOLEAN
		do
			from
			until
				end_of_text or found
			loop
				get_any_token;
				found := token_type /= separator_token_type and token_type /= 0
			end
		ensure
			end_of_text or
			(token_type /= separator_token_type
			and token_type /= 0)
		end;

	buffer_item_code (c: INTEGER): INTEGER
		do
			Result := buffer.item_code (c);
			if Result = 255 then
				Result := -1
			end;
		end;

	get_any_token
			-- Try to read a new token.
			-- Recognize longest possible string.
			--| Thus, when a token is recognized, this routine keeps
			--| track of its type, but goes on analyzing, until the
			--| current state has a void successor.
		require
			dfa_not_void: dfa /= Void;
			not_end_of_text: not end_of_text;
			buffers_created: buffer /= Void
		local
			state: detachable STATE_OF_DFA;
			too_big, buffer_resized: BOOLEAN;
			local_string: STRING
			l_dfa: like dfa
			l_cat_table: like categories_table
		do
			if token_end >= almost_end_of_buffer then
				fill_buffer (token_end);
				token_end := 0
			end;
			read_index := token_end + 1;
			if
				buffer_item_code (read_index) = -1
			then
				end_of_text := True;
				token_type := -1;
				token_start := token_end;
				other_possible_tokens := Void
			else
				token_type := 0;
				token_start := token_end + 1
			end;
			if read_index > buffer_size then
				if token_start = 1 then
					buffer_resized := True;
					resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
				else
					fill_buffer (token_start - 1);
					token_end := 0
				end;
				get_any_token
			else
				from
					l_dfa := dfa
					l_cat_table := categories_table
					state := l_dfa.item (1);
					if state /= Void then
						state := state.item (l_cat_table.item
							(buffer_item_code (read_index)))
					end
				until
					state = Void or too_big
				loop
					if state.final /= 0 then
						token_type := state.final;
						other_possible_tokens := state.final_array;
						token_end := read_index
					end;
					read_index := read_index + 1;
					if read_index > buffer_size then
						too_big := True
					else
						state := state.item (l_cat_table.item
								(buffer_item_code (read_index)))
					end
				end;
				if too_big then
					if token_start = 1 then
						buffer_resized := True;
						resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
					else
						fill_buffer (token_start - 1);
						token_end := 0
					end;
					get_any_token
				else
					if token_type = 0 then
						token_end := token_end + 1;
						read_index := read_index + 1
					end;
					local_string := buffer.substring (token_start, token_end);
					last_token.set (token_type,
							line_nb_array.item (token_start),
							column_nb_array.item (token_start),
							keyword_code (local_string),
							local_string);
					debug ("lex_output")
						io.put_string ("Last token:%N");
						io.put_string (last_token.out);
						io.put_string ("Type return:");
						io.new_line
						--io.read_character
					end
				end
			end;
			if buffer_resized then
				resize_and_fill_buffer (Standard_buffer_size, token_end);
				token_end := 0
			end
		end;

	get_short_token
			-- Read shortest token that matches one of the
			-- lexical grammar's regular expressions.
		require
			dfa_not_void: dfa /= Void;
			not_end_of_text: not end_of_text;
			buffers_created: buffer /= Void
		local
			state: detachable STATE_OF_DFA;
			too_big, recognized, buffer_resized: BOOLEAN;
			local_string: STRING
			l_dfa: like dfa
			l_cat_table: like categories_table
		do
			if token_end >= almost_end_of_buffer then
				fill_buffer (token_end);
				token_end := 0
			end;
			read_index := token_end + 1;
			if buffer_item_code (read_index) = -1 then
				end_of_text := True;
				token_type := -1;
				token_start := token_end;
				other_possible_tokens := Void
			else
				token_type := 0;
				token_start := token_end + 1
			end;
			if read_index > buffer_size then
				if token_start = 1 then
					buffer_resized := True;
					resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
				else
					fill_buffer (token_start - 1);
					token_end := 0
				end;
				get_short_token
			else
				from
					l_dfa := dfa
					l_cat_table := categories_table
					state := l_dfa.item (1)
					if state /= Void then
						state := state.item (l_cat_table.item
								(buffer_item_code (read_index)));
					end
				until
					state = Void or recognized or too_big
				loop
					if state.final /= 0 then
						token_type := state.final;
						other_possible_tokens := state.final_array;
						token_end := read_index;
						recognized := True
					end;
					read_index := read_index + 1;
					if read_index > buffer_size then
						too_big := True
					else
						state := state.item (l_cat_table.item
									(buffer_item_code (read_index)))
					end
				end;
				if too_big then
					if token_start = 1 then
						buffer_resized := True;
						resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
					else
						fill_buffer (token_start - 1);
						token_end := 0
					end;
					get_short_token
				else
					if token_type = 0 then
						token_end := token_end + 1;
						read_index := read_index + 1
					end;
					local_string := buffer.substring (token_start, token_end);
					last_token.set (token_type,
									line_nb_array.item (token_start),
									column_nb_array.item (token_start),
									keyword_code (local_string),
									local_string)
				end
			end;
			if buffer_resized then
				resize_and_fill_buffer (Standard_buffer_size, token_end);
				token_end := 0
			end
		end;

	get_fixed_token (l: INTEGER)
			-- Read new token that matches one of the
			-- lexical grammar's regular expressions.
			-- Recognize longest possible string with
			-- length less than or equal to `l'.
		require
			dfa_not_void: dfa /= Void;
			not_end_of_text: not end_of_text;
			buffers_created: buffer /= Void
		local
			state: detachable STATE_OF_DFA;
			too_big, buffer_resized: BOOLEAN;
			local_string: STRING
			l_dfa: like dfa
			l_cat_table: like categories_table
		do
			if token_end >= almost_end_of_buffer then
				fill_buffer (token_end);
				token_end := 0
			end;
			read_index := token_end + 1;
			if buffer_item_code (read_index) = -1 then
				end_of_text := True;
				token_type := -1;
				token_start := token_end;
				other_possible_tokens := Void
			else
				token_type := 0;
				token_start := token_end + 1
			end;
			if read_index > buffer_size then
				if token_start = 1 then
					buffer_resized := True;
					resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
				else
					fill_buffer (token_start - 1);
					token_end := 0
				end;
				get_fixed_token (l)
			else
				from
					l_dfa := dfa
					l_cat_table := categories_table
					state := l_dfa.item (1)
					if state /= Void then
						state := state.item (l_cat_table.item
								(buffer_item_code (read_index)))
					end
				until
					state = Void or (read_index - token_start) = l or too_big
				loop
					if state.final /= 0 then
						token_type := state.final;
						other_possible_tokens := state.final_array;
						token_end := read_index
					end;
					read_index := read_index + 1;
					if read_index > buffer_size then
						too_big := True
					else
						state := state.item (l_cat_table.item
								(buffer_item_code (read_index)))
					end
				end;
				if too_big then
					if token_start = 1 then
						buffer_resized := True;
						resize_and_fill_buffer (buffer_size + Extra_buffer_size, 0)
					else
						fill_buffer (token_start - 1);
						token_end := 0
					end;
					get_fixed_token (l)
				else
					if token_type = 0 then
						token_end := token_end + 1;
						read_index := read_index + 1
					end;
					local_string := buffer.substring (token_start, token_end);
					last_token.set (token_type,
									line_nb_array.item (token_start),
									column_nb_array.item (token_start),
									keyword_code (local_string),
									local_string)
				end
			end
			if buffer_resized then
				resize_and_fill_buffer (Standard_buffer_size, token_end);
               	token_end := 0
			end
		end;

feature -- Output

	trace
			-- Output information about the analyzer's
			-- current status.
		local
			l_dfa: like dfa
			l_cat_table: like categories_table
			i: INTEGER
		do
			l_dfa := dfa
			l_cat_table := categories_table
			from
				i := l_cat_table.lower;
				io.put_string (" LEXICAL%N Categories table.%N From ");
				io.put_integer (i)
			until
				i = l_cat_table.upper
			loop
				i := i + 1;
				if l_cat_table.item (i) /= l_cat_table.item (i - 1) then
					io.put_string (" to ");
					io.put_integer (i - 1);
					io.put_string (" ");
					io.put_integer (l_cat_table.item (i - 1));
					io.put_string ("th category.%N From ");
					io.put_integer (i)
				end
			end;
			io.put_string (" to ");
			io.put_integer (i);
			io.put_string (" ");
			io.put_integer (l_cat_table.item (i));
			io.put_string ("-th category.%N End of categories table.%N");
			l_dfa.trace;
			io.put_string (" End LEXICAL.");
			io.new_line
		end;

feature -- Obsolete

	go_on
			obsolete "Use `get_token' directly. [2017-05-31]"
		do
			from
				get_token
			until
				token_type /= 0 or end_of_text
			loop
				get_token
			end
		end;

feature {LEXICAL} -- Implementation

	initialize
			-- Create data structures for the lexical analyzer.
		do
			create_buffers (Standard_buffer_size, Standard_line_length);
			if keyword_h_table = Void then
				create keyword_h_table.make (1)
			end;
			end_of_text := False
		end;

feature {LEXICAL, LEX_BUILDER} -- Implementation

	initialize_attributes (d: FIXED_DFA; c: ARRAY [INTEGER]; k: detachable HASH_TABLE [INTEGER, STRING]; b: BOOLEAN)
			-- Set the first four attributes of Current.
		do
			dfa := d;
			categories_table := c;
			if k = Void then
				create keyword_h_table.make (1)
			else
				keyword_h_table := k;
			end
			keywords_case_sensitive := b
		end;

feature -- Implementation

	dfa: FIXED_DFA
			-- Automaton used for the parsing

feature {NONE} -- Implementation

	Standard_buffer_size: INTEGER = 10240;
			-- Standard buffer size

	Extra_buffer_size: INTEGER = 4096;
			-- size added to the initial `buffer_size' when the current token
			-- is too big.
			-- `Extra_buffer_size' should be less than `Standard_buffer_size'.

	Standard_line_length: INTEGER = 1024;
			-- Standard line length

	Max_token_length: INTEGER = 256;
			-- Maximum length for a token

	Almost_end_of_buffer: INTEGER = 9984;
			-- Buffer_size minus Max_token_length

	Close_of_file: INTEGER = 255;
			-- End-of-file indicator on some platforms

	categories_table: ARRAY [INTEGER];
			-- For each input, category number

	keyword_h_table: HASH_TABLE [INTEGER, STRING]
			-- Keywords table

	keywords_case_sensitive: BOOLEAN;
			-- Are the keyword case sensitive?

	separator_token_type : INTEGER;
			-- Type of token used as separators, (e.g. white space)

	token_end: INTEGER;
			-- Position in buffer of the end
			-- of the last recognized token

	token_start: INTEGER;
			-- Position in buffer of the beginning
			-- of the last recognized token

	lower_word: detachable STRING;
			-- String used to avoid modifying last_string_read

	read_index: INTEGER;
			-- Current position in buffer

	reset_data
		do
			read_index := 1;
			token_end := buffer_size
		end;

	is_keyword (word: STRING): BOOLEAN
			-- Is `word' a keyword included in the
			-- last token type read?
		local
			l_word: like lower_word
		do
			Result := token_type = keyword_h_table.item (word);
			if not Result and not keywords_case_sensitive then
				l_word := word.as_lower
				lower_word := l_word
				Result := token_type = keyword_h_table.item (l_word)
			end
		end;

note
	copyright:	"Copyright (c) 1984-2017, Eiffel Software and others"
	license:	"Eiffel Forum License v2 (see http://www.eiffel.com/licensing/forum.txt)"
	source: "[
			Eiffel Software
			5949 Hollister Ave., Goleta, CA 93117 USA
			Telephone 805-685-1006, Fax 805-685-6869
			Website http://www.eiffel.com
			Customer support http://support.eiffel.com
		]"

end