note
	description: "Creates a WEKA_ARFF_ATTRIBUTE from an attribute declaration line of an arff file"
	author: "Nikolay Kazmin"
	date: "$Date$"
	revision: "$Revision$"

class
	WEKA_ARFF_ATTRIBUTE_FACTORY

inherit
	KL_SHARED_STRING_EQUALITY_TESTER

feature -- Status report

	is_attribute (a_line: STRING): BOOLEAN
			-- Does `a_line' start with the attribute keyword
			-- `{WEKA_CONSTANTS}.attr'?
		do
			Result := a_line.starts_with ({WEKA_CONSTANTS}.attr)
		end

feature -- Factory

	create_attribute (a_attr_line: STRING): WEKA_ARFF_ATTRIBUTE
			-- New attribute object described by the declaration `a_attr_line'.
			-- `a_attr_line' itself is left untouched; a copy is analysed.
			--
			-- The kind of attribute is chosen from the text following the name:
			--  * a brace-enclosed value list gives a boolean attribute when the
			--    values are exactly "True" and "False", a nominal one otherwise;
			--  * text containing `{WEKA_CONSTANTS}.numeric' gives a numeric attribute;
			--  * text containing `{WEKA_CONSTANTS}.str' gives a string attribute;
			--  * anything else gives a nominal attribute with an empty value set.
		require
			line_is_an_attribute: is_attribute (a_attr_line)
		local
			l_name: STRING
			l_type: STRING
			l_set: DS_HASH_SET [STRING]
			l_line: STRING
			l_is_value_list: BOOLEAN
		do
			l_line := a_attr_line.twin
				-- Drop the keyword. Its length is taken from the constant that
				-- `is_attribute' checks against rather than hard-coded.
			l_line.keep_tail (l_line.count - ({WEKA_CONSTANTS}.attr).count)
				-- Strip surrounding whitespace in one pass so that mixed
				-- space/tab runs are removed completely; the helper features
				-- below require a line without leading blanks.
			l_line.left_adjust
			l_line.right_adjust
			l_name := parse_attr_name (l_line)
			l_type := cut_off_name (l_line)
				-- A value list must be recognised before looking for type
				-- keywords: its values may themselves contain such keywords.
			l_is_value_list := l_type.starts_with ("{")
			if not l_is_value_list and then l_type.has_substring ({WEKA_CONSTANTS}.numeric) then
				create {WEKA_ARFF_NUMERIC_ATTRIBUTE} Result.make (l_name)
			elseif not l_is_value_list and then l_type.has_substring ({WEKA_CONSTANTS}.str) then
				create {WEKA_ARFF_STRING_ATTRIBUTE} Result.make (l_name)
			else
				l_set := nominal_values_set (l_type)
				if l_set.count = 2 and l_set.has ("True") and l_set.has ("False") then
					create {WEKA_ARFF_BOOLEAN_ATTRIBUTE} Result.make (l_name)
				else
					create {WEKA_ARFF_NOMINAL_ATTRIBUTE} Result.make (l_name, l_set)
				end
			end
		end

feature {NONE} -- Implementation

	nominal_values_set (a_type: STRING): DS_HASH_SET [STRING]
			-- Values listed between the last '{' of `a_type' and the first
			-- '}' following it, split at ',' and stripped of surrounding
			-- whitespace. Empty set if `a_type' contains no '{'.
			-- If the closing '}' is missing, the list extends to the end
			-- of `a_type'.
		local
			l_values_list: LIST [STRING]
			l_value: STRING
			l_open_index: INTEGER
			l_close_index: INTEGER
		do
			create Result.make (5)
			Result.set_equality_tester (string_equality_tester)
				-- `last_index_of' needs a start position of at least 1,
				-- so an empty type description must be skipped.
			if not a_type.is_empty then
				l_open_index := a_type.last_index_of ('{', a_type.count)
			end
			if l_open_index > 0 then
				l_close_index := a_type.index_of ('}', l_open_index + 1)
				if l_close_index = 0 then
						-- Unterminated list: read up to the end of the text.
					l_close_index := a_type.count + 1
				end
				l_values_list := a_type.substring (l_open_index + 1, l_close_index - 1).split (',')
				from
					l_values_list.start
				until
					l_values_list.after
				loop
					l_value := l_values_list.item_for_iteration
					l_value.left_adjust
					l_value.right_adjust
					Result.force_last (l_value)
					l_values_list.forth
				end
			end
		end

	parse_attr_name (a_line: STRING): STRING
			-- Attribute name at the beginning of `a_line'.
			-- For a name written between double quotes the quotes are
			-- not part of the result.
		require
			attribute_is_cut_out: not a_line.starts_with ({WEKA_CONSTANTS}.attr)
			spaces_are_cut_out: not a_line.starts_with (" ") and not a_line.starts_with ("%T")
		local
			l_end_index: INTEGER
		do
			l_end_index := attribute_name_end_index (a_line)
			if a_line.starts_with ("%"") then
					-- Skip the opening quote and stop before the closing one.
				Result := a_line.substring (2, l_end_index - 1)
			else
				Result := a_line.substring (1, l_end_index)
			end
		end

	cut_off_name (a_line: STRING): STRING
			-- Copy of `a_line' without the attribute name at its beginning
			-- and without the whitespace that follows the name.
		require
			attribute_is_cut_out: not a_line.starts_with ({WEKA_CONSTANTS}.attr)
			spaces_are_cut_out: not a_line.starts_with (" ") and not a_line.starts_with ("%T")
		do
			Result := a_line.substring (attribute_name_end_index (a_line) + 1, a_line.count)
				-- Remove every leading blank, whatever the mix of spaces and tabs.
			Result.left_adjust
		ensure
			leading_spaces_removed: not Result.starts_with (" ") and not Result.starts_with ("%T")
			name_removed: a_line.count >= attribute_name_end_index (a_line) + Result.count
		end

	attribute_name_end_index (a_line: STRING): INTEGER
			-- Index in `a_line' at which the attribute name ends.
			-- For a name starting with a double quote: index of the next
			-- double quote (0 if there is none).
			-- Otherwise: index of the last character before the first space
			-- or tab, or `a_line.count' if the line contains neither.
		require
			attribute_is_cut_out: not a_line.starts_with ({WEKA_CONSTANTS}.attr)
			spaces_are_cut_out: not a_line.starts_with (" ") and not a_line.starts_with ("%T")
		local
			l_space_index, l_tab_index: INTEGER
		do
			if a_line.starts_with ("%"") then
				Result := a_line.index_of ('"', 2)
			else
				l_space_index := a_line.index_of (' ', 1)
				l_tab_index := a_line.index_of ('%T', 1)
				if l_space_index = 0 and l_tab_index = 0 then
						-- No separator at all: the name runs to the end of the line.
					Result := a_line.count
				elseif l_space_index = 0 then
					Result := l_tab_index - 1
				elseif l_tab_index = 0 then
					Result := l_space_index - 1
				else
						-- Both separators occur: the name ends before the earlier one.
					Result := l_space_index.min (l_tab_index) - 1
				end
			end
		ensure
			non_negative: Result >= 0
			within_line: Result <= a_line.count
		end

end