note
	description: "General utility for text processing with Unicode."

class
	SC_LANGUAGE_UTILITY

inherit

	SC_LANGUAGE_DATA
		export
			{SC_LANGUAGE_UTILITY} all
		end

feature -- Access

	Default_separator: STRING_32 = ", "
			-- String used to separate strings like words.

	Default_newline: STRING_32 = "%N"
			-- Newline used for normalization.

	Default_source_code_language: SC_LANGUAGE
			-- Usual natural language of source code.
		once
				-- American English.
			create Result.make_with_region ("en", "US")
		end

	Default_punctuation: STRING_32
			-- Standard definition of punctuation in words:
			-- all characters of inherited `Middle_letter' followed by
			-- all characters of inherited `Middle_number_letter'.
		once
			create Result.make_empty
			across
				Middle_letter as punctuation
			loop
				Result.append_character (punctuation.item)
			end
			across
				Middle_number_letter as punctuation
			loop
				Result.append_character (punctuation.item)
			end
		end

	default_words_of_text (text: READABLE_STRING_32): LIST [TUPLE [base, length: INTEGER]]
			-- Find default word limits of Unicode `text' for spell checking.
			-- Each item gives the 1-based start index `base' and the
			-- character count `length' of one word, using `Default_punctuation'
			-- as the set of characters allowed between letters.
		do
			Result := words_of_text_with_punctuation (text, Default_punctuation)
		ensure
			bases_positive: across Result as word all word.item.base >= 1 end
			lengths_positive: across Result as word all word.item.length >= 1 end
			intervals_sorted_and_disjoint: across 1 |..| (Result.count - 1) as index all Result [index.item].base + Result [index.item].length <= Result [index.item + 1].base end
			all_words: across Result as word all is_default_word (text.substring (word.item.base, word.item.base + word.item.length - 1)) end
		end

	words_of_text_with_punctuation (text, punctuation: READABLE_STRING_32): LIST [TUPLE [base, length: INTEGER]]
			-- Find word limits of Unicode `text' for spell checking
			-- with characters of `punctuation' valid between letters.
			-- Each item gives the 1-based start index `base' and the
			-- character count `length' of one word.
		local
			left, right: TUPLE [base, length: INTEGER]
		do
				-- Only few ideas from default word boundary specification
				-- of Unicode Standard Annex #29 about text segmentation
				-- are implemented. However, algorithm would need to be adapted anyway,
				-- since words like "32.3" are not desired for this purpose.
			Result := letter_substrings (text)
				-- Now merge raw words across certain punctuation.
				-- For example, "can't" should be treated as one word, not two.
			Result.start
			from
			until
				Result.off
			loop
				left := Result.item
				Result.forth
				if not Result.off then
					right := Result.item
					if left.base + left.length + 1 = right.base then
							-- There is exactly one character between `left' and `right' substring.
						if punctuation.has (text [right.base - 1]) then
								-- Character belongs to word punctuation, thus merge words.
								-- Drop `right', step back onto `left' and widen it, so the
								-- merged word is examined again as `left' in the next
								-- iteration (this handles chains like "a'b'c").
							Result.remove
							Result.back
							Result.replace ([left.base, left.length + right.length + 1])
						end
					end
				end
			variant
				Result.count - Result.index + 1
			end
		ensure
			bases_positive: across Result as word all word.item.base >= 1 end
			lengths_positive: across Result as word all word.item.length >= 1 end
			intervals_sorted_and_disjoint: across 1 |..| (Result.count - 1) as index all Result [index.item].base + Result [index.item].length <= Result [index.item + 1].base end
			all_words: across Result as word all is_word_with_punctuation (text.substring (word.item.base, word.item.base + word.item.length - 1), punctuation) end
		end

	is_default_word (text: READABLE_STRING_32): BOOLEAN
			-- Is given `text' single word using default punctuation?
		do
			Result := is_word_with_punctuation (text, Default_punctuation)
		ensure
			not_empty_word: text.is_empty implies not Result
			has_letter: Result implies across text as character some is_letter (character.item) end
		end

	is_word_with_punctuation (text, punctuation: READABLE_STRING_32): BOOLEAN
			-- Is given `text' single word?
			-- Characters from `punctuation' are valid between letters.
			-- First and last character have to be letters; every other
			-- non-letter has to be in `punctuation' and directly follow a letter.
		local
			last_letter, now_letter: BOOLEAN
			index: INTEGER
		do
			if not text.is_empty then
				Result := True
				from
					index := 0
				until
					not Result or index = text.count
				loop
					index := index + 1
					now_letter := is_letter (text [index])
					if index = 1 or index = text.count then
							-- Word has to start and end with a letter.
						Result := now_letter
					elseif not now_letter then
							-- Word punctuation has to be between two letters.
							-- Only the preceding character is checked here; the
							-- following one is checked by the next iteration.
						if punctuation.has (text [index]) then
							Result := last_letter
						else
							Result := False
						end
					end
					last_letter := now_letter
				variant
					text.count - index
				end
			end
		ensure
			not_empty_word: text.is_empty implies not Result
			has_letter: Result implies across text as character some is_letter (character.item) end
		end

	segment_text (text, separator: READABLE_STRING_32): LIST [STRING_32]
			-- Break up `text' on `separator'. If `separator' is prefix or suffix of
			-- `text', then first or last segment, respectively, is empty.
			-- Empty `text' yields a single empty segment.
		require
			separator_nonempty: not separator.is_empty
		local
			base, limit: INTEGER
		do
			create {LINKED_LIST [STRING_32]} Result.make
			base := 1
			from
			until
				base = text.count + 1
			loop
					-- Find next separator.
				limit := text.substring_index (separator, base)
				if limit = 0 then
						-- Last segment is nonempty.
					Result.extend (text.substring (base, text.count))
					base := text.count + 1
				else
					Result.extend (text.substring (base, limit - 1))
					base := limit + separator.count
				end
			variant
				text.count - base + 1
			end
			if text.is_empty or limit /= 0 then
					-- Last segment is empty: either there was no text at all,
					-- or the last search found a separator ending exactly at end of `text'.
				Result.extend ("")
			end
		ensure
			nonempty: not Result.is_empty
			inversion_correct: concatenate_texts (Result, separator) ~ text
		end

	concatenate_texts (texts: LIST [READABLE_STRING_32]; separator: READABLE_STRING_32): STRING_32
			-- Concatenate `texts' with `separator' between.
			-- Result is empty if `texts' is empty.
		do
			Result := ""
			across
				texts as text
			loop
					-- It is separator, not terminator.
				if not text.is_first then
					Result.append (separator)
				end
				Result.append (text.item)
			end
		ensure
			separators_limited: Result.count >= (texts.count - 1) * separator.count
		end

	first_newline (text: READABLE_STRING_32): TUPLE [base, length: INTEGER]
			-- Index and length of first Unicode newline in `text', if any.
			-- Both zero if no newline present. Newline may be more than one character.
			-- Longer newlines are matched first, for example first newline
			-- found in "Hello%R%NWorld" is "%R%N" and not "%R".
		local
			index: INTEGER
		do
			Result := [0, 0]
			across
				Newlines as newline
			loop
				index := text.substring_index (newline.item, 1)
				if
					index /= 0 and then
					(Result.base = 0 or else index < Result.base or else
					(index = Result.base and newline.item.count > Result.length))
				then
						-- First newline, earlier one, or longer one at same position found.
						-- The length tie-break makes the longest match win
						-- independently of the order of items in `Newlines'.
					Result.base := index
					Result.length := newline.item.count
				end
			end
		ensure
			base_nonnegative: Result.base >= 0
			length_nonnegative: Result.length >= 0
			no_newline: (Result.base = 0) = (Result.length = 0)
			valid_limits: Result.base /= 0 implies (1 <= Result.base and Result.base + Result.length <= text.count + 1)
			newline_present: Result.base /= 0 implies (Newlines.has (text.substring (Result.base, Result.base + Result.length - 1)))
		end

feature {NONE} -- Implementation

	letter_substrings (text: READABLE_STRING_32): LIST [TUPLE [base, length: INTEGER]]
			-- Find substring limits of Unicode `text' only consisting of letters.
			-- Each item is a maximal run of letters, given by its
			-- 1-based start index `base' and its character count `length'.
		local
			last_in_word, current_in_word: BOOLEAN
		do
			create {LINKED_LIST [TUPLE [INTEGER, INTEGER]]} Result.make
			across
				text as character
			loop
				current_in_word := is_letter (character.item)
				if current_in_word then
					if last_in_word then
							-- Letter continues the run started earlier.
						Result.last.length := Result.last.length + 1
					else
							-- Letter starts a new run at current position.
						Result.extend ([character.cursor_index, 1])
					end
				end
				last_in_word := current_in_word
			end
		ensure
			bases_positive: across Result as substring all substring.item.base >= 1 end
			lengths_positive: across Result as substring all substring.item.length >= 1 end
			substrings_sorted_disjoint_and_fewest_possible: across 1 |..| (Result.count - 1) as index all Result [index.item].base + Result [index.item].length < Result [index.item + 1].base end
			only_letters: across Result as substring all across substring.item.base |..| (substring.item.base + substring.item.length - 1) as index all is_letter (text [index.item]) end end
		end

	is_letter (character: CHARACTER_32): BOOLEAN
			-- Does Unicode `character' belong to major general category letter?
			-- Decided by looking up the character code in inherited
			-- `Letter_intervals_sorted'.
		do
			Result := contains (Letter_intervals_sorted, character.code)
		end

	contains (set: ARRAY [TUPLE [base, length: INTEGER]]; element: INTEGER): BOOLEAN
			-- Does `set' given by sorted and disjoint intervals contain `element'?
			-- Each interval covers `base' up to `base + length - 1'.
			-- False for empty `set'.
		require
			set_sorted_and_disjoint: across set.lower |..| (set.upper - 1) as index all set [index.item].base + set [index.item].length <= set [index.item + 1].base end
		local
			low, high, middle: INTEGER
		do
				-- Empty `set' contains nothing; guard needed because
				-- `set [set.lower]' is not a valid access then.
			if not set.is_empty then
					-- Binary search to find interval.
				low := set.lower
				high := set.upper
				if set [low].base <= element then
					if element < set [high].base then
							-- Not in last interval.
						from
						invariant
							set.lower <= low and low < high and high <= set.upper
							set [low].base <= element and element < set [high].base
						until
							high - low = 1
						loop
							middle := (low + high) // 2
								-- Now low < middle < high.
							if set [middle].base <= element then
								low := middle
							else
								high := middle
							end
						variant
							high - low - 1
						end
					else
							-- Maybe in last interval.
						low := high
					end
						-- `low' is now the last interval starting at or before
						-- `element'; check whether it reaches far enough.
					Result := element < set [low].base + set [low].length
				end
			end
		end

invariant
	default_newline_is_newline: Newlines.has (Default_newline)

end