note
	description: "[
		Analyze the UnicodeData reference specification to produce helper classes that can
		be used by STRING_32 and CHARACTER_32 to perform some complex string operations.
	]"
	date: "$Date$"
	revision: "$Revision$"

class
	UNICODE_HELPER_GENERATOR

create
	make

feature {NONE} -- Initialization

	make
			-- Run application.
			-- Parse the command line, read the Unicode data file and then, depending on the
			-- options: print filtered data, generate the character property class, or report
			-- code point ranges for the requested categories.
		local
			categories: like {UNICODE_CHARACTER_DATA}.category
		do
				-- Valid Unicode code points range from `0` to `0x10FFFF`.
			create unicode_filter.make (maximum_code_point.as_integer_32 + 1)
			create group.make (0)
			create argument_parser.make
			argument_parser.execute (agent do_nothing)
			if argument_parser.is_successful and then attached argument_parser.input_file as l_file then
				density := argument_parser.density
				read_unicode_data (l_file)
				if has_error then
					io.error.put_string_32 (l_file + ": error occured!%N")
				elseif attached unicode_data as l_unicode_data implies l_unicode_data.is_empty then
					io.error.put_string_32 (l_file + " has no unicode character data in it.%N")
				elseif attached argument_parser.filter_file as f then
					read_properties (f, agent (l, h: NATURAL_32; p: ITERABLE [STRING_8])
						do
								-- Mark all code points between `l` and `h`.
							⟳ i: l.as_integer_32 |..| h.as_integer_32 ¦ unicode_filter.set (i) ⟲
						end)
					if has_error then
							-- NOTE(review): the failure reported here comes from reading the filter
							-- file `f`, yet the message names `l_file` -- confirm which is intended.
						io.error.put_string_32 (l_file + ": cannot read!%N")
					else
						if attached argument_parser.group_file as g then
							read_properties (g, agent (l, h: NATURAL_32; p: ITERABLE [STRING_8])
								do
										-- Record groups.
									⟳ n: p ¦ group.extend (l, h, n) ⟲
								end)
						end
						output_filtered_data (l_unicode_data)
					end
				elseif not argument_parser.has_range then
					process_properties (argument_parser.property_template, l_unicode_data)
				elseif argument_parser.categories.is_empty then
					io.error.put_string_32 ("No categories have been specified.%N")
				else
					across argument_parser.categories as c loop
						if attached {UNICODE_CHARACTER_DATA}.category_mask [c.item] as category and then category /= 0 then
							categories := categories ⦶ category
						else
								-- Terminate the message with a new line so that consecutive
								-- diagnostics do not run together.
							io.error.put_string_32 ({STRING_32} "Unknown category: " + c.item + ". Ignoring it.%N")
						end
					end
					report_ranges (l_unicode_data, categories)
				end
			end
		end

feature {NONE} -- Query

	maximum_code_point: NATURAL_32 = 0x10_FFFF
			-- Maximum value of a Unicode code point.

feature {NONE} -- Access

	density: REAL_64
			-- Density of the table we generate.

	group: ARRAYED_LIST [TUPLE [min, max: NATURAL_32; name: STRING]]
			-- A list of named character intervals.

	output_path: READABLE_STRING_32
			-- Path where files will be generated.
		require
			is_successful: argument_parser.is_successful
		do
			Result := argument_parser.output_path
		end

	unicode_data: detachable ARRAYED_LIST [UNICODE_CHARACTER_DATA] note option: stable attribute end
			-- List collecting all the Unicode characters and their properties.

	unicode_table: detachable HASH_TABLE [UNICODE_CHARACTER_DATA, NATURAL_32] note option: stable attribute end
			-- Same as `unicode_data' but indexed by the Unicode code.

	unicode_filter: PACKED_BOOLEANS
			-- Filter for `unicode_data`.
			-- `True` indicates that the entry should be preserved.
			-- `False` indicates that the entry should be discarded.

	unicode_version: READABLE_STRING_32
			-- Version of Unicode data.
		require
			is_successful: argument_parser.is_successful
		do
			Result := argument_parser.unicode_version
		end

feature {NONE} -- Status Report

	is_statistic_requested: BOOLEAN
			-- Is generation of statistics required?
		require
			is_successful: argument_parser.is_successful
		do
			Result := argument_parser.has_statistic
		end

	has_error: BOOLEAN
			-- Did we encounter an error of some sort?

feature {NONE} -- Basic operations

	read_unicode_data (a_file: READABLE_STRING_32)
			-- Read the Unicode data `a_file' and store it into `unicode_data` and `unicode_table`.
			-- On any exception, `has_error' is set and both attributes are left untouched.
		local
			l_input: PLAIN_TEXT_FILE
			l_list: like unicode_data
			l_table: like unicode_table
			l_data: detachable UNICODE_CHARACTER_DATA
			retried: BOOLEAN
		do
			if not retried then
					-- First read the Unicode data file and create a list describing all
					-- Unicode characters.
				create l_input.make_with_name (a_file)
				l_input.open_read
				create l_list.make (2000)
				create l_table.make (2000)
				from
					l_input.read_line
				until
					l_input.end_of_file
				loop
						-- Skip empty lines and lines starting with '#'.
					if
						attached l_input.last_string as l_line and then
						not l_line.is_empty and then
						l_line.item (1) /= '#'
					then
						create l_data.make (l_line)
						l_list.extend (l_data)
						l_table.put (l_data, l_data.code)
					end
					l_input.read_line
				end
				l_input.close

					-- Patch the following values for character to lower.
				l_data := l_table.item (978)
				if l_data /= Void and then not l_data.has_lower_code then
					l_data.set_lower_code (965)
				end
				l_data := l_table.item (979)
				if l_data /= Void and then not l_data.has_lower_code then
					l_data.set_lower_code (973)
				end
				l_data := l_table.item (980)
				if l_data /= Void and then not l_data.has_lower_code then
					l_data.set_lower_code (971)
				end

					-- Patch the following values for character to upper/title.
				l_data := l_table.item (912)
				if l_data /= Void then
					if not l_data.has_upper_code then
						l_data.set_upper_code (938)
					end
					if not l_data.has_title_code then
						l_data.set_title_code (938)
					end
				end
				l_data := l_table.item (944)
				if l_data /= Void then
					if not l_data.has_upper_code then
						l_data.set_upper_code (939)
					end
					if not l_data.has_title_code then
						l_data.set_title_code (939)
					end
				end

				unicode_data := l_list
				unicode_table := l_table
			else
				has_error := True
			end
		rescue
				-- Retry once, taking the branch that only records the error.
			retried := True
			retry
		end

	process_properties (a_template_file: READABLE_STRING_32; a_unicode_data: attached like unicode_data)
			-- Using `a_unicode_data' representing the Unicode standard for lower and upper tables,
			-- generate Eiffel code for CHARACTER_32 that will let you perform the operation
			-- `to_lower' and `to_upper'. We only perform simple case folding.
			-- The class text is read from `a_template_file', its markers are replaced by the
			-- generated tables and helpers, and the result is written to
			-- `character_property_filename' (under `output_path' when it is not empty).
		local
			l_input, l_output: PLAIN_TEXT_FILE
			l_lowers, l_uppers, l_titles: like extract_case_ranges
			l_properties: like extract_case_ranges
			l_diffs, l_simplified_diffs: like mismatches
			l_tables, l_class, l_filter: STRING
			l_filename: PATH
		do
				-- We generate the various mapping. Those mappings are sparse.
			l_lowers := extract_case_ranges ("lower", a_unicode_data, agent {UNICODE_CHARACTER_DATA}.has_lower_code, agent {UNICODE_CHARACTER_DATA}.lower_code)
			l_uppers := extract_case_ranges ("upper", a_unicode_data, agent {UNICODE_CHARACTER_DATA}.has_upper_code, agent {UNICODE_CHARACTER_DATA}.upper_code)
			l_titles := extract_case_ranges ("title", a_unicode_data, agent {UNICODE_CHARACTER_DATA}.has_title_code, agent {UNICODE_CHARACTER_DATA}.title_code)
			l_properties := extract_case_ranges ("property", a_unicode_data, agent {UNICODE_CHARACTER_DATA}.has_property, agent {UNICODE_CHARACTER_DATA}.property_flags)

				-- We have noticed that the table for upper and title cases are very similar.
				-- As of Unicode 6.2.0, there were really 9 differences (i.e. 3 characters) that
				-- had a different title case than the upper case. Instead of regenerating
				-- almost the same data twice, we collect the differences that will be used
				-- to generate the title tables using the upper data and the override.
			l_diffs := mismatches (l_uppers, l_titles)

				-- Generate the code to build the tables in `l_tables'.
			create l_tables.make (5120)
			generate_case_ranges (l_tables, l_lowers, to_lower_table_name, True)
			generate_case_ranges (l_tables, l_uppers, to_upper_table_name, True)
			if l_diffs = Void then
					-- Title tables are only needed when no override can be computed.
				generate_case_ranges (l_tables, l_titles, to_title_table_name, True)
			end
			generate_case_ranges (l_tables, l_properties, property_table_name, False)

				-- Let's generate our class now.
			create l_input.make_with_name (a_template_file)
			l_input.open_read
			l_input.read_stream (l_input.count)
			l_class := l_input.last_string
			l_input.close

			if attached output_path as l_path and then not l_path.is_empty then
				create l_filename.make_from_string (l_path)
				create l_output.make_with_path (l_filename.extended (character_property_filename))
			else
				create l_output.make_with_name (character_property_filename)
			end
			l_output.open_write

			l_class.replace_substring_all (generator_marker, argument_parser.name + " " + argument_parser.version)
			l_class.replace_substring_all (unicode_version_marker, unicode_version.to_string_8)
			l_class.replace_substring_all (tables_marker, l_tables)

			create l_filter.make (10)
			generate_filter (l_filter, l_lowers, to_lower_table_name, 4, True)
			l_class.replace_substring_all (to_lower_helper_marker, l_filter)

			create l_filter.make (10)
			generate_filter (l_filter, l_uppers, to_upper_table_name, 4, True)
			l_class.replace_substring_all (to_upper_helper_marker, l_filter)

				-- Special cases for `title' case where if there are not too many
				-- differences we simply generate an override.
			create l_filter.make (10)
			if l_diffs /= Void then
				l_simplified_diffs := title_fix_up (l_diffs)
				if l_simplified_diffs = l_diffs then
						-- No simplification was possible: select on the original code.
					generate_override (l_filter, l_diffs, "l_code", 4)
				else
						-- Simplified table is keyed by the upper-cased code.
					generate_override (l_filter, l_simplified_diffs, "Result.natural_32_code", 4)
				end
			else
				generate_filter (l_filter, l_titles, to_title_table_name, 4, True)
			end
			l_class.replace_substring_all (to_title_helper_marker, l_filter)

			create l_filter.make (10)
			generate_filter (l_filter, l_properties, property_table_name, 3, False)
			l_class.replace_substring_all (property_helper_marker, l_filter)

			l_output.put_string (l_class)
			l_output.close
		end

	extract_case_ranges (
			a_table_name: STRING;
			a_list: ARRAYED_LIST [UNICODE_CHARACTER_DATA];
			a_filter: FUNCTION [UNICODE_CHARACTER_DATA, BOOLEAN];
			a_value: FUNCTION [UNICODE_CHARACTER_DATA, NATURAL_32];
		): ARRAYED_LIST [ARRAYED_LIST [TUPLE [key, value: NATURAL_32]]]
			-- Helper function that generate the ranges for the various conversion of
			-- a Unicode character to either lower, upper or title case. This function
			-- tries to optimize the total density so that it is no less than `density'.
			-- Only entries of `a_list' above the ASCII range and accepted by `a_filter' are
			-- kept; each is recorded with the value computed by `a_value'.
			-- `a_table_name' is only used when printing statistics.
		require
			a_table_name_set: a_table_name /= Void
			a_list_set: a_list /= Void
			a_filter_set: a_filter /= Void
			a_value_set: a_value /= Void
		local
			l_group: ARRAYED_LIST [TUPLE [key, value: NATURAL_32]]
			l_upper_group_code: NATURAL_32
			l_total_count, l_used_space_count: NATURAL_32
			l_done: BOOLEAN
			l_offset, l_smallest_offset: NATURAL_32
			l_previous_group: detachable like extract_case_ranges
			l_formatter: FORMAT_DOUBLE
		do
				-- We compute `Result' iteratively until we reach a density that is no less
				-- than `density'. To do that, we always store the previous result and as soon
				-- as we go under `density' we should stop and return the previous result.
				-- This works because the density can only go down since our groups will contain
				-- larger gaps.
			from
					-- We store the previous group as a starting point.
				create Result.make (1)
					-- To compute the groups, we first group the characters so that they
					-- are contiguous. Then at each iteration, we calculate `l_smallest_offset'
					-- which is the smallest gaps between 2 groups of characters for the current
					-- grouping. This avoids having to increase by just one at each iteration
					-- for no benefits if the minimum gap is larger than one.
				l_offset := 1
			until
				l_done
			loop
					-- Reset `l_smallest_offset' to the maximum possible value. If this
					-- does not change, it means we reached the case where only one grouping is being done.
				l_smallest_offset := {NATURAL_32}.max_value

					-- Create our first group.
				create Result.make (1)
				create l_group.make (1)
				Result.extend (l_group)

					-- Reset `l_total_count' to compute the number of characters that will be in `Result'.
					-- Note that at each iteration it is being recomputed even though the value will always
					-- be the same.
				l_total_count := 0

					-- Store the initial character code, which will be used to calculate the gap between
					-- groups.
				l_upper_group_code := 0

					-- Iterate through the Unicode data.
				across a_list as l_char_data loop
					if
							-- Only care if the Unicode character has some case transformation, but
							-- since we optimized ASCII value, we ignore them.
						l_char_data.item.code > {CHARACTER_8}.max_ascii_value.to_natural_32 and then
							-- Only care about our particular case transformation
						a_filter.item ([l_char_data.item])
					then
						if l_upper_group_code = 0 then
								-- This is the first character we encounter that have a case transformation.
								-- We initialize the upper bound of the group for the first time.
							l_upper_group_code := l_char_data.item.code
						end

							-- Increment our number of characters.
						l_total_count := l_total_count + 1

							-- If the previous code is more than `l_offset' character away, we create a group.
						if l_char_data.item.code > l_upper_group_code + l_offset then
								-- Calculate the gap between the previous group and this new one.
								-- Using exactly that gap as the next offset merges only the closest
								-- groups, so no intermediate grouping is skipped. The value is always
								-- greater than `l_offset', which guarantees progress.
							l_smallest_offset := l_smallest_offset.min (l_char_data.item.code - l_upper_group_code)
								-- Create a new group where characters will be added
							create l_group.make (10)
							Result.extend (l_group)
						end
							-- Set `l_upper_group_code' with the new upper bound of the group.
						l_upper_group_code := l_char_data.item.code
							-- Extend our character and its cased transformation in our group.
						l_group.extend ([l_upper_group_code, a_value.item ([l_char_data.item])])
					end
				end

					-- Let's compute our density ratio and find out if we should stop or not.
					-- First compute the space used by our tables.
				l_used_space_count := 0
				across Result as l_set loop
					if not l_set.item.is_empty then
						l_used_space_count := l_used_space_count + l_set.item.last.key - l_set.item.first.key + 1
					end
				end

					-- If our filling density ratio is less than `density' we should stop
					-- and retrieve our previously computed group if any.
					-- A ratio equal to `density' is acceptable ("no less than").
				if l_used_space_count = 0 or ((l_total_count / l_used_space_count) < density) then
					l_done := True
					if l_previous_group /= Void then
						Result := l_previous_group
					end
				else
						-- Otherwise we continue if we haven't reached the minimum number of
						-- group that one can do, i.e. one.
					l_done := l_smallest_offset = {NATURAL_32}.max_value
					if not l_done then
							-- We are not done yet, we continue with a larger offset.
						l_previous_group := Result
						l_offset := l_smallest_offset
					end
				end
			end

			if is_statistic_requested then
					-- Recompute the space used by the tables of the final `Result'
					-- to report its actual density.
				l_used_space_count := 0
				across Result as l_set loop
					if not l_set.item.is_empty then
						l_used_space_count := l_used_space_count + l_set.item.last.key - l_set.item.first.key + 1
					end
				end
				create l_formatter.make (4, 3)
				io.put_string ("Table " + a_table_name + " has a density of " +
					l_formatter.formatted ((l_total_count / l_used_space_count)) + " in " +
					Result.count.out + " group(s) for " + l_total_count.out + " character(s)")
				io.put_new_line
			end
		ensure
			result_set: Result /= Void
		end

	generate_case_ranges (a_output: STRING; a_ranges: like extract_case_ranges; a_table_name: STRING; is_identity: BOOLEAN)
			-- Generate all the tables for `a_ranges' in `a_output' using `a_table_name' as prefix to the table names.
			-- If a value is not present in the table and `is_identity' we generate the character code value, otherwise we generate `0'.
		require
			a_ranges_not_empty: across a_ranges as l_entry all not l_entry.item.is_empty end
			a_table_name_not_empty: not a_table_name.is_empty
		local
			l_data_type: STRING
			l_output, l_values: STRING
			i, l_row_count: INTEGER
			l_code: NATURAL_32
			l_count: INTEGER
		do
			i := 0
			across a_ranges as l_range loop
				i := i + 1
					-- First compute the type for the current range. To optimize
					-- we try to fit everything into NATURAL_8, NATURAL_16, we keep NATURAL_32
					-- for high Unicode values.
				if across l_range.item as l_val all l_val.item.value <= {NATURAL_8}.max_value end then
					l_data_type := "NATURAL_8"
				elseif across l_range.item as l_val all l_val.item.value <= {NATURAL_16}.max_value end then
					l_data_type := "NATURAL_16"
				else
					l_data_type := "NATURAL_32"
				end
				l_output := table_template.twin
				l_output.replace_substring_all ("$table_name", a_table_name + i.out)
				l_output.replace_substring_all ("$data_type", l_data_type)
				l_output.replace_substring_all ("$low", {UNICODE_CHARACTER_DATA}.hexadecimal_code_point (l_range.item.first.key))
				l_output.replace_substring_all ("$high", {UNICODE_CHARACTER_DATA}.hexadecimal_code_point (l_range.item.last.key))

					-- Approximation of the size of the string needed to store all the values
				create l_values.make ((l_range.item.last.key - l_range.item.first.key).to_integer_32 * 5)
				l_code := l_range.item.first.key
				l_row_count := 0
				across l_range.item as l_char loop
					l_row_count := l_row_count + 1
					if l_char.item.key > l_code then
							-- If there is a gap, we fill it
						from
						until
							l_code >= l_char.item.key
						loop
							if is_identity then
								l_values.append_natural_32 (l_code)
							else
								l_values.append_natural_8 (0)
							end
							l_count := l_count + 1
							l_values.append_character (',')
							if l_row_count > 20 then
								l_values.append_character ('%N')
								write_tab (l_values, 4)
								l_row_count := 0
							else
								l_row_count := l_row_count + 1
								l_values.append_character (' ')
							end
							l_code := l_code + 1
						end
					end
						-- Update to new value
					l_code := l_char.item.key + 1
					l_values.append_natural_32 (l_char.item.value)
					l_count := l_count + 1
					if not l_char.is_last then
						l_values.append_character (',')
							-- Insert a new line once the current row is long enough.
						if l_row_count > 20 then
							l_values.append_character ('%N')
							write_tab (l_values, 4)
							l_row_count := 0
						else
							l_values.append_character (' ')
						end
					end
				end
				l_output.replace_substring_all ("$values", l_values)
				a_output.append (l_output)
				a_output.append_character ('%N')
				a_output.append_character ('%N')
			end
		end

	generate_filter (a_output: STRING; a_ranges: like extract_case_ranges; a_table_name: STRING; a_nb_tab: INTEGER; is_converted_to_char: BOOLEAN)
			-- Generate a binary search tree of `ifs' statement to quickly access our values except for the
			-- first range which is checked all the time since most characters are in that range.
			-- The generated text is appended to `a_output', indented by `a_nb_tab' tabulations.
			-- If `is_converted_to_char', the looked up value is converted with `to_character_32'.
		require
			a_table_name_not_empty: not a_table_name.is_empty
			a_ranges_not_empty: not a_ranges.is_empty
			a_ranges_content_not_empty: across a_ranges as l_entry all not l_entry.item.is_empty end
		do
				-- We generate the if statement for the current range of values.
			write_tab (a_output, a_nb_tab)
			a_output.append ("if (")
			a_output.append_natural_32 (a_ranges.i_th (1).first.key)
			a_output.append (" <= l_code) and (l_code <= ")
			a_output.append_natural_32 (a_ranges.i_th (1).last.key)
			a_output.append (") then")
			a_output.append_character ('%N')
			write_tab (a_output, a_nb_tab + 1)
			a_output.append ("Result := ")
			a_output.append (a_table_name)
			a_output.append_integer (1)
			a_output.append (".item ((l_code - ")
			a_output.append_natural_32 (a_ranges.i_th (1).first.key)
			a_output.append (").to_integer_32)")
			if is_converted_to_char then
				a_output.append (".to_character_32")
			end
			a_output.append_character ('%N')
			if a_ranges.count > 1 then
					-- No need to generate a binary search tree if there is only one range available.
					-- With two or more ranges, every range after the first one must be reachable,
					-- including the case of exactly two ranges.
				write_tab (a_output, a_nb_tab)
				a_output.append ("else")
				a_output.append_character ('%N')
				generate_binary_search_filter (a_output, a_ranges, 2, a_ranges.count, a_table_name, a_nb_tab + 1, is_converted_to_char)
			end
			write_tab (a_output, a_nb_tab)
			a_output.append ("end")
		end

	generate_binary_search_filter (a_output: STRING; a_ranges: like extract_case_ranges; a_lower_bound, a_upper_bound: INTEGER; a_table_name: STRING; a_nb_tab: INTEGER; is_converted_to_char: BOOLEAN)
			-- Generate a binary search tree of `ifs' statement to quickly access our values.
			-- Only ranges of `a_ranges' at indexes `a_lower_bound' .. `a_upper_bound' are handled.
		require
			a_table_name_not_empty: not a_table_name.is_empty
			a_lower_bound_not_too_small: a_lower_bound >= 1
			a_upper_bound_not_too_big: a_upper_bound <= a_ranges.count
			valid_bounds: a_lower_bound <= a_upper_bound
			a_ranges_not_empty: across a_ranges as l_entry all not l_entry.item.is_empty end
		local
			l_middle: INTEGER
		do
			if a_lower_bound = a_upper_bound then
					-- Leaf: we generate the if statement for the current range of values.
				write_tab (a_output, a_nb_tab)
				a_output.append ("if l_code >= ")
				a_output.append_natural_32 (a_ranges.i_th (a_lower_bound).first.key)
					-- Special case to generate the leaf node that check for all values greater than
					-- can be handled by the last range.
				if a_ranges.upper = a_lower_bound then
					a_output.append (" and l_code <= ")
					a_output.append_natural_32 (a_ranges.i_th (a_lower_bound).last.key)
				end
				a_output.append (" then")
				a_output.append_character ('%N')
				write_tab (a_output, a_nb_tab + 1)
				a_output.append ("Result := ")
				a_output.append (a_table_name)
				a_output.append_integer (a_lower_bound)
				a_output.append (".item ((l_code - ")
				a_output.append_natural_32 (a_ranges.i_th (a_lower_bound).first.key)
				a_output.append (").to_integer_32)")
				if is_converted_to_char then
					a_output.append (".to_character_32")
				end
				a_output.append_character ('%N')
				write_tab (a_output, a_nb_tab)
				a_output.append ("end")
				a_output.append_character ('%N')
			else
				l_middle := (a_lower_bound + a_upper_bound) // 2
					-- Inner node: split on the upper key of the middle range.
				write_tab (a_output, a_nb_tab)
				a_output.append ("if l_code <= ")
				a_output.append_natural_32 (a_ranges.i_th (l_middle).last.key)
				a_output.append (" then")
				a_output.append_character ('%N')
				generate_binary_search_filter (a_output, a_ranges, a_lower_bound, l_middle, a_table_name, a_nb_tab + 1, is_converted_to_char)
				write_tab (a_output, a_nb_tab)
				a_output.append ("else")
				a_output.append_character ('%N')
				generate_binary_search_filter (a_output, a_ranges, l_middle + 1, a_upper_bound, a_table_name, a_nb_tab + 1, is_converted_to_char)
				write_tab (a_output, a_nb_tab)
				a_output.append ("end")
				a_output.append_character ('%N')
			end
		end

	generate_override (a_output: STRING; a_diffs: attached like mismatches; a_variable: STRING; a_nb_tab: INTEGER)
			-- Generate code in `a_output` to select the new values.
			-- The generated code first applies `to_upper' and then uses an `inspect' on
			-- `a_variable' with one branch per entry of `a_diffs': the codes listed in the
			-- entry select the entry key as the new result.
		do
			write_tab (a_output, a_nb_tab)
			a_output.append ("Result := to_upper (Result)%N")
			write_tab (a_output, a_nb_tab)
			a_output.append ("inspect ")
			a_output.append (a_variable)
			a_output.append_character ('%N')
			across a_diffs as l_entry loop
				write_tab (a_output, a_nb_tab)
				a_output.append ("when ")
				across l_entry.item as l_codes loop
					a_output.append_natural_32 (l_codes.item)
					if not l_codes.is_last then
						a_output.append (", ")
					end
				end
				a_output.append (" then%N")
				write_tab (a_output, a_nb_tab + 1)
				a_output.append ("Result := (")
				a_output.append_natural_32 (l_entry.key)
				a_output.append (").to_character_32")
				a_output.append_character ('%N')
			end
			write_tab (a_output, a_nb_tab)
			a_output.append ("else%N")
			write_tab (a_output, a_nb_tab)
			a_output.append ("end")
		end

	read_properties (filter_file_name: READABLE_STRING_32; handle: PROCEDURE [NATURAL_32, NATURAL_32, ITERABLE [STRING_8]])
			-- Read character properties from file `filter_file_name` and report them by calling `handle`
			-- with lower and upper code point of the range as well as the associated properties.
			-- An empty `filter_file_name' means the standard input.
			-- Each non-empty line not starting with '#' is expected to look like
			-- "XXXX[..YYYY][; property]*" where XXXX and YYYY are 1 to 6 hexadecimal digits.
			-- Malformed entries and exceptions set `has_error'. Nothing is read if `has_error'
			-- is already set.
		local
			f: PLAIN_TEXT_FILE
			code: like {PLAIN_TEXT_FILE}.last_string
			line: like {PLAIN_TEXT_FILE}.last_string
			i, j: INTEGER
			n, m: NATURAL_32
			properties: ARRAYED_LIST [STRING_8]
			empty_properties: ARRAYED_LIST [STRING_8]
			property: like {PLAIN_TEXT_FILE}.last_string
		do
			if not has_error then
				if filter_file_name.is_empty then
					f := io.input
				else
					create f.make_open_read (filter_file_name)
				end
				create empty_properties.make (0)
				from
				until
					f.after
				loop
					f.read_line
					line := f.last_string
					line.left_adjust
					if not line.is_empty and then line [1] /= '#' then
							-- Split the line into the code point part and the ';'-separated properties.
						i := line.index_of (';', 1)
						if i > 0 then
							code := line.substring (1, i - 1)
							create properties.make (1)
							from
							until
								i > line.count
							loop
								j := line.index_of (';', i + 1)
								if j = 0 then
									j := line.count + 1
								end
								property := line.substring (i + 1, j - 1)
								property.adjust
								properties.extend (property)
								i := j
							end
						else
							code := line
							properties := empty_properties
						end
							-- Read first code point.
						i := 1
						from
							n := 0
						until
							i > code.count or else not code [i].is_hexa_digit
						loop
							n := n ⧀ 4 + code [i].to_hexa_digit
							i := i + 1
						end
							-- `i - 1` digits have been read: there should be between 1 and 6 of them
							-- (the same rule is applied to the second code point below).
						if i <= 1 or i > 7 or n > maximum_code_point then
							has_error := True
							io.error.put_string ("Invalid entry format: " + f.last_string + "%N")
						end
						if i > code.count then
								-- There is only one code point.
							handle (n, n, properties)
						elseif code [i].is_space then
								-- There is only one code point followed by a sequence of white space characters.
							handle (n, n, properties)
							from
							until
								i > code.count or else not code [i].is_space
							loop
								i := i + 1
							end
								-- Anything left after the white space, even a single character, is an error.
							if i <= code.count then
								has_error := True
								io.error.put_string ("Invalid entry format: " + f.last_string + "%N")
							end
						elseif i + 1 >= code.count or else code [i] /= '.' or else code [i + 1] /= '.' then
								-- Neither the end, nor white space, nor ".." followed by something.
							handle (n, n, properties)
							has_error := True
							io.error.put_string ("Invalid entry format: " + f.last_string + "%N")
						else
								-- There should be another number after "..".
							i := i + 2
							j := i
							from
								m := 0
							until
								i > code.count or else not code [i].is_hexa_digit
							loop
								m := m ⧀ 4 + code [i].to_hexa_digit
								i := i + 1
							end
							if i <= j or i - j > 6 or m > maximum_code_point then
								has_error := True
								io.error.put_string ("Invalid entry format: " + f.last_string + "%N")
							end
							from
							until
								i > code.count or else not code [i].is_space
							loop
								i := i + 1
							end
							if i <= code.count then
								has_error := True
								io.error.put_string ("Invalid entry format: " + f.last_string + "%N")
							end
							handle (n, m, properties)
						end
					end
				end
				if f /= io.input then
					f.close
				end
			end
		rescue
				-- The retried body does nothing because `has_error' is now set.
			has_error := True
			retry
		end

	output_filtered_data (d: attached like unicode_data)
			-- Output Unicode data `d` filtered using `unicode_filter` to the standard output.
			-- The filter is inverted when the command line requests negation.
			-- An entry is printed once per group of `group' it belongs to, or once without
			-- group name if it belongs to none.
		local
			n: BOOLEAN
			is_printed: BOOLEAN
		do
			n := argument_parser.is_negated
			across d as i loop
				if unicode_filter [i.item.code.as_integer_32] xor n then
					is_printed := False
					across group as g loop
						if g.item.min <= i.item.code and then i.item.code <= g.item.max then
								-- The code point is in group `g.item`, print it together with the group name.
							io.put_string (i.item.hexadecimal_code)
							io.put_character (';')
							io.put_string (i.item.name)
							io.put_character (';')
							io.put_string (g.item.name)
							io.put_new_line
							is_printed := True
						end
					end
					if not is_printed then
							-- Print the code point without any group name.
						io.put_string (i.item.hexadecimal_code)
						io.put_character (';')
						io.put_string (i.item.name)
						io.put_character (';')
						io.put_new_line
					end
				end
			end
		end

feature {NONE} -- Helpers

	title_fix_up (a_table: attached like mismatches): attached like mismatches
			-- Special case of simplifying mismatch based on our knowledge that we are trying to compute
			-- the title case using first the upper case conversion and then patching what needs to.
			-- When all the codes of every entry share the same upper case code, the result maps
			-- each entry key to that single upper case code. Otherwise `a_table' itself is
			-- returned, so that callers can detect with `=' that nothing was simplified.
		require
			unicode_table_set: unicode_table /= Void
			a_table_has_no_empty_list: across a_table as l_entry all not l_entry.item.is_empty end
		local
			l_list: ARRAYED_LIST [NATURAL_32]
			l_same: BOOLEAN
			l_upper_char: NATURAL_32
		do
			create Result.make (a_table.count)
			l_same := True
			across a_table as l_entry until not l_same loop
					-- Compute the upper character for the set
				if
					attached unicode_table as l_unicode_table and then
					attached l_unicode_table.item (l_entry.item.first) as l_char
				then
					if l_char.has_upper_code then
						l_upper_char := l_char.upper_code
					else
						l_upper_char := l_char.code
					end
				end
					-- Check that every code of the set has that same upper character.
				across l_entry.item as l_codes until not l_same loop
					if
						attached unicode_table as l_unicode_table and then
						attached l_unicode_table.item (l_codes.item) as l_char
					then
						if l_char.has_upper_code then
							l_same := l_char.upper_code = l_upper_char
						else
							l_same := l_char.code = l_upper_char
						end
					else
						l_same := False
					end
				end
				if l_same then
					create l_list.make (1)
					l_list.extend (l_upper_char)
					Result.extend (l_list, l_entry.key)
				end
			end
			if not l_same then
					-- We could not optimize: the iteration stopped early, so the table built so
					-- far is incomplete and would mix upper-cased and original codes.
					-- Keep the original table unchanged instead.
				Result := a_table
			end
		end

	mismatches (l_table1, l_table2: like extract_case_ranges): detachable HASH_TABLE [ARRAYED_LIST [NATURAL_32], NATURAL_32]
			-- Compute the differences between two sets `l_table1' and `l_table2', and return the necessary information
			-- required to patch `l_table1' to get to `l_table2'.
			-- If blocks are different we return nothing as there are too many differences.
			-- The result maps a target value to the list of codes that should produce it.
			-- Note: the cursors of both tables and of their items are moved.
		local
			l_diffs: detachable HASH_TABLE [NATURAL_32, NATURAL_32]
			l_matches: detachable ARRAYED_LIST [NATURAL_32]
		do
				-- The range 10D0..10FF in Unicode 11.0.0 has different upper case, but the same title case.
				-- If this were taken into account, the patch could still be computed. The algorithm below should be modified to accomplish this though.
				-- TODO: Update the algorithm to deal with ranges present in one table and absent in the other one.
			if l_table1.count = l_table2.count then
					-- We use `l_diffs' to collect all the mismatches between the two lists.
				create l_diffs.make (1)
				from
					l_table1.start
					l_table2.start
				until
					l_table1.after or l_diffs = Void
				loop
					if l_table1.item.count = l_table2.item.count then
						from
							l_table1.item.start
							l_table2.item.start
						until
							l_table1.item.after
						loop
							if l_table1.item.item /~ l_table2.item.item then
									-- Note the different usage of `put' and `force'.
									-- When overriding a value in `l_table1' we use `put'
									-- as if we have previously entered a value for `l_table2'
									-- we don't want to lose it.
								l_diffs.put (l_table1.item.item.key, l_table1.item.item.key)
								l_diffs.force (l_table2.item.item.value, l_table2.item.item.key)
							end
							l_table1.item.forth
							l_table2.item.forth
						end
					else
							-- Blocks of different sizes: give up.
						l_diffs := Void
					end
					l_table1.forth
					l_table2.forth
				end
			end

				-- Now that we have the list, we are going to merge the values with the same output
				-- in `l_table2' to create a list that will be more compact.
			if l_diffs /= Void then
				create Result.make (l_diffs.count)
				across l_diffs as l_entry loop
					l_matches := Result.item (l_entry.item)
					if l_matches = Void then
						create l_matches.make (3)
						Result.extend (l_matches, l_entry.item)
					end
					l_matches.extend (l_entry.key)
				end
			end
		end

	write_tab (a_string: STRING; a_nb_tab: INTEGER)
			-- Write `a_nb_tab' tabulation characters in `a_string'.
			-- Nothing is written when `a_nb_tab' is not positive.
		local
			i: INTEGER
		do
			from
				i := a_nb_tab
			until
				i <= 0
			loop
				a_string.append_character ('%T')
				i := i - 1
			end
		end

	table_template: STRING_8 = "{
	$table_name: SPECIAL [$data_type]
			-- Table for Unicode characters in the range 0x$low .. 0x$high.
		once
			Result := ({ARRAY [$data_type]} <<
				$values
			>>).area
		end
}"

	character_property_filename: STRING_32 = "character_property.e"
			-- Name of output file

	to_lower_table_name: STRING = "to_lower_table_"
	to_upper_table_name: STRING = "to_upper_table_"
	to_title_table_name: STRING = "to_title_table_"
	property_table_name: STRING = "property_table_"
			-- Name of various entities we generate.

	tables_marker: STRING = "--$TABLES"
	to_lower_helper_marker: STRING = "--$TO_LOWER_HELPER"
	to_upper_helper_marker: STRING = "--$TO_UPPER_HELPER"
	to_title_helper_marker: STRING = "--$TO_TITLE_HELPER"
	property_helper_marker: STRING = "--$PROPERTY_HELPER"
	unicode_version_marker: STRING = "$UNICODE_VERSION"
	generator_marker: STRING = "$GENERATOR"
			-- Various marker in the template file that will be replaced.

feature {NONE} -- Ranges

	report_ranges (data: attached like unicode_data; categories: like {UNICODE_CHARACTER_DATA}.category)
			-- Collect and report ranges of code points that belong to categories.
			-- `data' is sorted by code point first. Each range is printed on its own line
			-- as "FIRST" or "FIRST-LAST" on the standard output.
		require
			not data.is_empty
		local
			last: like {UNICODE_CHARACTER_DATA}.code
			is_in_interval: BOOLEAN
			is_last_printed: BOOLEAN
		do
			;(create {BUBBLE_SORTER [UNICODE_CHARACTER_DATA]}.make (
				create {PREDICATE_COMPARATOR [UNICODE_CHARACTER_DATA]}.make (
					agent (u, v: UNICODE_CHARACTER_DATA): BOOLEAN do Result := u.code < v.code end))).sort (data)
			is_last_printed := True
			across data as p loop
				if p.item.category ⊗ categories = 0 then
						-- The code point does not match the criteria.
						-- Print last entry in the range if not done yet.
					if is_in_interval then
						if not is_last_printed then
							io.put_character ('-')
							put_code_point (last)
							is_last_printed := True
						end
						io.put_new_line
						is_in_interval := False
					end
				elseif not is_in_interval then
						-- This is the first item in the interval.
					last := p.item.code
					put_code_point (last)
					is_in_interval := True
					is_last_printed := True
				elseif last + 1 = p.item.code then
						-- This is the next item in the interval, simply record it.
					last := last + 1
					is_last_printed := False
				else
						-- The interval is over, and the new one has to be started.
					if not is_last_printed then
						io.put_character ('-')
						put_code_point (last)
					end
					io.put_new_line
					last := p.item.code
					put_code_point (last)
					is_in_interval := True
					is_last_printed := True
				end
			end
				-- Close the interval that is still open at the end of the data, if any.
			if is_in_interval then
				if not is_last_printed then
					io.put_character ('-')
					put_code_point (last)
				end
				io.put_new_line
			end
		end

	put_code_point (n: like {UNICODE_CHARACTER_DATA}.code)
			-- Print code point `n` as 4 or more hexadecimal digits.
		do
			io.put_string_32 ({UNICODE_CHARACTER_DATA}.hexadecimal_code_point (n))
		end

feature {NONE} -- Command line processing

	argument_parser: ARGUMENT_PARSER;
			-- Parser of command line arguments.

note
	ca_ignore: "CA011", "CA011: too many arguments"

end