note description: "Unicode description of a character code." date: "$Date$" revision: "$Revision$" class UNICODE_CHARACTER_DATA create make feature {NONE} -- Initialization make (a_desc: STRING) -- Initialize Current using a line description that is made of fields separated by a semicolon, see below for descriptioon: -- Field 0: Code value, normative, Code value in 4-digit hexadecimal format. -- Field 1: Character name, normative, These names match exactly the names published in -- the Unicode Standard. -- Field 2: General Category, normative/informative, This is a useful breakdown -- into various "character types" which can be used as a default categorization -- in implementations. See below for a brief explanation. -- Field 3: Canonical Combining Classes, normative, The classes used for the Canonical -- Ordering Algorithm in the Unicode Standard. These classes are also printed -- in Chapter 4 of the Unicode Standard. -- Field 4: Bidirectional Category, normative, See the list below for an explanation of the -- abbreviations used in this field. These are the categories required by the -- Bidirectional Behavior Algorithm in the Unicode Standard. These categories are -- summarized in Chapter 3 of the Unicode Standard. -- Field 5: Character Decomposition Mapping, normative, In the Unicode Standard, -- not all of the mappings are full (maximal) decompositions. Recursive application of -- look-up for decompositions will, in all cases, lead to a maximal decomposition. -- The decomposition mappings match exactly the decomposition mappings published -- with the character names in the Unicode Standard. -- Field 6: Decimal digit value, normative, This is a numeric field. If the character -- has the decimal digit property, as specified in Chapter 4 of the Unicode Standard, -- the value of that digit is represented with an integer value in this field -- Field 7: Digit value, normative, This is a numeric field. If the character represents -- a digit, not necessarily a decimal digit, the value is here. This covers digits -- which do not form decimal radix forms, such as the compatibility superscript digits -- Field 8: Numeric value, normative, This is a numeric field. If the character has the numeric -- property, as specified in Chapter 4 of the Unicode Standard, the value of that character -- is represented with an integer or rational number in this field. This includes -- fractions as, e.g., "1/5" for U+2155 VULGAR FRACTION ONE FIFTH Also included are -- numerical values for compatibility characters such as circled numbers. -- Field 8: Mirrored, normative, If the character has been identified as a "mirrored" character -- in bidirectional text, this field has the value "Y"; otherwise "N". The list of -- mirrored characters is also printed in Chapter 4 of the Unicode Standard. -- Field 10: Unicode 1.0 Name, informative, This is the old name as published in Unicode 1.0. -- This name is only provided when it is significantly different from the Unicode -- name for the character. -- Field 11: 10646 comment field, informative, This is the ISO 10646 comment field. -- It is in parantheses in the 10646 names list. -- Field 12: Uppercase Mapping, informative, Upper case equivalent mapping. If a -- character is part of an alphabet with case distinctions, and has an upper case -- equivalent, then the upper case equivalent is in this field. See the explanation -- below on case distinctions. These mappings are always one-to-one, not -- one-to-many or many-to-one. This field is informative. -- Field 13: Lowercase Mapping, informative, Similar to Uppercase mapping -- Field 14: Titlecase Mapping, informative, Similar to Uppercase mapping local l_list: LIST [STRING] do l_list := a_desc.split (';') if l_list.count = 15 then code := to_natural (l_list.i_th (1)) name := l_list.i_th (2) general_category := l_list.i_th (3) canonical_combining_class := to_natural (l_list.i_th (4)) bidirectional_category := l_list.i_th (5) character_decomposition_tag := void_if_empty (l_list.i_th (6)) decimal_digit_value := void_if_empty (l_list.i_th (7)) digit_value := void_if_empty (l_list.i_th (8)) numeric_value := void_if_empty (l_list.i_th (9)) is_mirrored := attached l_list.i_th (10) as l_char and then l_char.count = 1 and then l_char.item (1) = 'Y' v1_name := void_if_empty (l_list.i_th (11)) iso_10646_comment := void_if_empty (l_list.i_th (12)) upper_code := to_natural (l_list.i_th (13)) lower_code := to_natural (l_list.i_th (14)) title_code := to_natural (l_list.i_th (15)) -- Let's setup our flags is_lower := general_category.same_string ("Ll") is_upper := general_category.same_string ("Lu") is_title := general_category.same_string ("Lt") is_digit := general_category.same_string ("Nd") if not is_digit and code.is_valid_character_8_code then inspect code.to_character_8 when 'a' .. 'f', 'A' .. 'F' then is_hexa_digit := True else is_hexa_digit := False end end is_punctuation := not general_category.is_empty and then general_category.item (1) = 'P' is_control := general_category.same_string ("Cc") is_space := (not general_category.is_empty and then general_category.item (1) = 'Z') or bidirectional_category.same_string ("WS") or bidirectional_category.same_string ("B") or bidirectional_category.same_string ("S") else code := 0 name := "" general_category := "" canonical_combining_class := 0 bidirectional_category := "" upper_code := 0 lower_code := 0 title_code := 0 end is_valid := not name.is_empty and then not general_category.is_empty and then not bidirectional_category.is_empty end feature -- Access code: NATURAL_32 -- Value for current character code. name: STRING -- Name of the current character. v1_name: detachable STRING -- Name of the current character in the Unicode 1.0 standard. general_category: STRING -- Name of the general category of the current character. canonical_combining_class: NATURAL_32 -- Class of the current character for the canonical ordering algorithm. bidirectional_category: STRING -- Category of the character with respect to the way characters are written -- For example in right-to-left languages, numbers will be displayed in left-to-right -- order. character_decomposition_tag: detachable STRING -- Decomposition mapping, and some time they have some tag formatting. -- For example some letters have 3 representations -- depending if the letter is at the beginning, middle or end of a word. It can be -- , , , ... decimal_digit_value: detachable STRING -- If current character represents a decimal digit, it is the value of that digit. digit_value: detachable STRING -- If current charachter represents a digit, it is the value of that digit. numeric_value: detachable STRING -- If current character has the numeric property then, it is the value of that character. -- For example, it could be "1/2", "1/3", "1", ... is_mirrored: BOOLEAN -- Is current character marked as mirrored for bidirectional text? iso_10646_comment: detachable STRING -- Comments for the ISO 10646 names. upper_code: NATURAL_32 lower_code: NATURAL_32 title_code: NATURAL_32 -- Upper, lower and title case mapping. Mapping is always one to one. -- If `0' then it is the character itself. feature {UNICODE_HELPER_GENERATOR} -- Element change set_lower_code (a_code: like lower_code) -- Set `lower_code' with `a_code'. require not_has_lower_code: not has_lower_code do lower_code := a_code ensure lower_code_set: lower_code = a_code end set_upper_code (a_code: like upper_code) -- Set `upper_code' with `a_code'. require not_has_upper_code: not has_upper_code do upper_code := a_code ensure upper_code_set: upper_code = a_code end set_title_code (a_code: like title_code) -- Set `title_code' with `a_code'. require not_has_title_code: not has_title_code do title_code := a_code ensure title_code_set: title_code = a_code end feature -- Status report is_valid: BOOLEAN -- Is Unicode data valid for Current?7 is_lower: BOOLEAN -- Is current character an upper character? is_upper: BOOLEAN -- Is current character an upper character? is_title: BOOLEAN -- Is current character an upper character? is_digit: BOOLEAN -- Is current character a digit? is_hexa_digit: BOOLEAN -- Is current an hexadecimal digit? is_punctuation: BOOLEAN -- Is current character a punctuation? is_control: BOOLEAN -- Is current a control character? is_space: BOOLEAN -- Is current a space of some sort? has_property: BOOLEAN -- Does current have one property setup? require is_valid: is_valid do Result := is_upper or is_lower or is_title or is_digit or is_hexa_digit or is_punctuation or is_control or is_space end property_flags: NATURAL_32 require is_valid: is_valid do if is_lower then Result := Result | is_lower_flag end if is_upper then Result := Result | is_upper_flag end if is_title then Result := Result | is_title_flag end if is_digit then Result := Result | is_digit_flag end if is_hexa_digit then Result := Result | is_hexa_digit_flag end if is_punctuation then Result := Result | is_punctuation_flag end if is_control then Result := Result | is_control_flag end if is_space then Result := Result | is_space_flag end end has_case: BOOLEAN -- Is current character a character with a case? require is_valid: is_valid do Result := has_lower_code or has_upper_code or has_title_code end has_lower_code: BOOLEAN -- Does current character have a corresponding lower character code associated with it? require is_valid: is_valid do Result := lower_code /= 0 and lower_code /= code end has_upper_code: BOOLEAN -- Does current character have a corresponding upper character code associated with it? require is_valid: is_valid do Result := upper_code /= 0 and upper_code /= code end has_title_code: BOOLEAN -- Does current character have a corresponding title character code associated with it? require is_valid: is_valid do Result := title_code /= 0 and title_code /= code end feature {NONE} -- Implementation void_if_empty (a_string: STRING): detachable STRING -- If `a_string' is empty, a Void value, otherwise `a_string' itself do if not a_string.is_empty then Result := a_string else Result := Void end ensure same_string_if_not_empty: not a_string.is_empty implies Result = a_string end to_natural (a_string: STRING): NATURAL_32 -- Convert `a_string' to a NATURAL_32. -- If `a_string' is empty, then 0 as a signaling value that nothing was specified. do if a_string.is_empty then Result := 0 else ctoi_convertor.reset ({NUMERIC_INFORMATION}.type_natural_32) ctoi_convertor.parse_string_with_type (a_string, {NUMERIC_INFORMATION}.type_natural_32) Result := ctoi_convertor.parsed_natural_32 end end ctoi_convertor: HEXADECIMAL_STRING_TO_INTEGER_CONVERTER -- Convertor used to convert hexadecimal string to integer or natural once create Result.make Result.set_leading_separators (" ") Result.set_trailing_separators (" ") Result.set_leading_separators_acceptable (True) Result.set_trailing_separators_acceptable (True) ensure ctoi_convertor_not_void: Result /= Void end is_upper_flag: NATURAL_8 = 0x01 is_lower_flag: NATURAL_8 = 0x02 is_title_flag: NATURAL_8 = 0x4 is_digit_flag: NATURAL_8 = 0x08 is_punctuation_flag: NATURAL_8 = 0x10 is_control_flag: NATURAL_8 = 0x20 is_hexa_digit_flag: NATURAL_8 = 0x40 is_space_flag: NATURAL_8 = 0x80 -- Various flags end