/[eiffelstudio]/FreeELKS/trunk/library/kernel/string_searcher.e
ViewVC logotype

Contents of /FreeELKS/trunk/library/kernel/string_searcher.e

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91477 - (show annotations)
Sun Jan 14 09:47:13 2007 UTC (13 years ago) by ericb
File size: 9501 byte(s)
Synchronized with ISE 6.0.65740
1 indexing
2 description: "[
3 Helper to perform efficient search of a string in another one.
4 Note: The algorithm used is the one described in Communications of the ACM,
5 volume 33, number 8, August 1990, by Daniel M. Sunday. The fuzzy
6 version was presented by Peter R. Sibbald in Communications of the
7 ACM, volume 35, number 4, April 1992 (Technical Correspondance).
8 ]"
9 library: "Free implementation of ELKS library"
10 copyright: "Copyright (c) 1986-2006, Eiffel Software and others"
11 license: "Eiffel Forum License v2 (see forum.txt)"
12 date: "$Date$"
13 revision: "$Revision$"
14
15 class
16 STRING_SEARCHER
17
18 create
19 make
20
21 feature {NONE} -- Initialization
22
23 make is
24 -- Initialize current
25 do
26 create deltas.make (max_ascii_character_value + 1)
27 end
28
29 feature -- Initialization
30
31 initialize_deltas (a_pattern: STRING_GENERAL) is
32 -- Initialize `deltas' with `a_pattern'.
33 -- Optimized for ASCII characters only.
34 require
35 a_pattern_not_void: a_pattern /= Void
36 a_pattern_valid: a_pattern.is_valid_as_string_8
37 do
38 internal_initialize_deltas (a_pattern, a_pattern.count, deltas)
39 end
40
41 feature -- Access
42
43 max_ascii_character_value: INTEGER is 255
44 -- Number of characters in the extended ASCII character set.
45
46 feature -- Search
47
48 substring_index (a_string, a_pattern: STRING_GENERAL; start_pos, end_pos: INTEGER): INTEGER is
49 -- Position of first occurrence of `a_pattern' at or after `start_pos'
50 -- and before `end_pos' in `a_string'.
51 -- 0 if there are no matches.
52 require
53 a_string_not_void: a_string /= Void
54 a_pattern_not_void: a_pattern /= Void
55 a_pattern_valid: a_pattern.is_valid_as_string_8
56 start_large_enough: start_pos >= 1
57 end_pos_large_enough: start_pos <= end_pos + 1
58 end_pos_small_enough: end_pos <= a_string.count
59 do
60 if a_string = a_pattern then
61 if start_pos = 1 then
62 Result := 1
63 end
64 else
65 if a_pattern.count = 0 then
66 Result := start_pos
67 elseif a_pattern.count <= end_pos - start_pos + 1 then
68 internal_initialize_deltas (a_pattern, a_pattern.count, deltas)
69 Result := substring_index_with_deltas (a_string, a_pattern, start_pos, end_pos)
70 end
71 end
72 end
73
74 substring_index_with_deltas (a_string, a_pattern: STRING_GENERAL; start_pos, end_pos: INTEGER): INTEGER is
75 -- Position of first occurrence of `a_pattern' at or after `start_pos' in `a_string'.
76 -- 0 if there are no matches.
77 require
78 a_string_not_void: a_string /= Void
79 a_pattern_not_void: a_pattern /= Void
80 a_pattern_valid: a_pattern.is_valid_as_string_8
81 a_pattern_not_empty: not a_pattern.is_empty
82 start_large_enough: start_pos >= 1
83 end_pos_large_enough: start_pos <= end_pos + 1
84 end_pos_small_enough: end_pos <= a_string.count
85 local
86 i, j, l_end_pos: INTEGER
87 l_pattern_count: INTEGER
88 l_matched: BOOLEAN
89 l_char_code: INTEGER
90 l_deltas: like deltas
91 do
92 if a_string = a_pattern then
93 if start_pos = 1 then
94 Result := 1
95 end
96 else
97 l_pattern_count := a_pattern.count
98 check l_pattern_count_positive: l_pattern_count > 0 end
99 from
100 i := start_pos
101 l_deltas := deltas
102 l_end_pos := end_pos + 1
103 until
104 i + l_pattern_count > l_end_pos
105 loop
106 from
107 j := 0
108 l_matched := True
109 until
110 j = l_pattern_count
111 loop
112 if a_string.code (i + j) /= a_pattern.code (j + 1) then
113 -- Mismatch, so we stop
114 j := l_pattern_count - 1 -- Jump out of loop
115 l_matched := False
116 end
117 j := j + 1
118 end
119
120 if l_matched then
121 -- We got the substring
122 Result := i
123 i := l_end_pos -- Jump out of loop
124 else
125 -- Pattern was not found, shift to next location
126 if i + l_pattern_count <= end_pos then
127 l_char_code := a_string.code (i + l_pattern_count).to_integer_32
128 if l_char_code > max_ascii_character_value then
129 i := i + 1
130 else
131 i := i + l_deltas.item (l_char_code)
132 end
133 else
134 i := i + 1
135 end
136 end
137 end
138 end
139 end
140
141 fuzzy_index (a_string, a_pattern: STRING_GENERAL; start_pos, end_pos, fuzzy: INTEGER): INTEGER is
142 -- Position of first occurrence of `a_pattern' at or after `start_pos' in
143 -- `a_string' with 0..`fuzzy' mismatches between `a_string' and `a_pattern'.
144 -- 0 if there are no fuzzy matches.
145 require
146 a_string_not_void: a_string /= Void
147 a_pattern_not_void: a_pattern /= Void
148 a_pattern_valid: a_pattern.is_valid_as_string_8
149 a_pattern_not_empty: not a_pattern.is_empty
150 start_large_enough: start_pos >= 1
151 end_pos_large_enough: start_pos <= end_pos + 1
152 end_pos_small_enough: end_pos <= a_string.count
153 fuzzy_non_negative: fuzzy >= 0
154 acceptable_fuzzy: fuzzy <= a_pattern.count
155 local
156 i, j, l_min_offset, l_end_pos: INTEGER
157 l_pattern_count, l_nb_mismatched: INTEGER
158 l_matched: BOOLEAN
159 l_char_code: INTEGER
160 l_deltas_array: like deltas_array
161 do
162 if fuzzy = a_pattern.count then
163 -- More mismatches than the pattern length.
164 Result := start_pos
165 else
166 if fuzzy = 0 then
167 Result := substring_index (a_string, a_pattern, start_pos, end_pos)
168 else
169 initialize_fuzzy_deltas (a_pattern, fuzzy)
170 from
171 l_pattern_count := a_pattern.count
172 l_deltas_array := deltas_array
173 i := start_pos
174 l_end_pos := end_pos + 1
175 until
176 i + l_pattern_count > l_end_pos
177 loop
178 from
179 j := 0
180 l_nb_mismatched := 0
181 l_matched := True
182 until
183 j = l_pattern_count
184 loop
185 if a_string.code (i + j) /= a_pattern.code (j + 1) then
186 l_nb_mismatched := l_nb_mismatched + 1;
187 if l_nb_mismatched > fuzzy then
188 -- Too many mismatched, so we stop
189 j := l_pattern_count - 1 -- Jump out of loop
190 l_matched := False
191 end
192 end
193 j := j + 1
194 end
195
196 if l_matched then
197 -- We got the substring
198 Result := i
199 i := l_end_pos -- Jump out of loop
200 else
201 if i + l_pattern_count <= end_pos then
202 -- Pattern was not found, compute shift to next location
203 from
204 j := 0
205 l_min_offset := l_pattern_count + 1
206 until
207 j > fuzzy
208 loop
209 l_char_code := a_string.code (i + l_pattern_count - j).to_integer_32
210 if l_char_code > max_ascii_character_value then
211 -- No optimization for a non-extended ASCII character.
212 l_min_offset := 1
213 j := fuzzy + 1 -- Jump out of loop
214 else
215 l_min_offset := l_min_offset.min (l_deltas_array.item (j).item (l_char_code))
216 end
217 j := j + 1
218 end
219 i := i + l_min_offset
220 else
221 i := i + 1
222 end
223 end
224 end
225 deltas_array := Void
226 end
227 end
228 end
229
230 feature {NONE} -- Implementation: Access
231
232 deltas: SPECIAL [INTEGER]
233 -- Record shifting deltas.
234
235 deltas_array: SPECIAL [like deltas]
236 -- Record shifting deltas for fuzzy search.
237
238 feature {NONE} -- Implementation
239
240 internal_initialize_deltas (a_pattern: STRING_GENERAL; a_pattern_count: INTEGER; a_deltas: like deltas) is
241 -- Initialize `a_deltas' with `a_pattern'.
242 -- Optimized for ASCII characters only.
243 require
244 a_pattern_not_void: a_pattern /= Void
245 a_pattern_valid: a_pattern.is_valid_as_string_8
246 a_pattern_count_not_negative: a_pattern_count >= 0
247 a_pattern_count_valid: a_pattern_count <= a_pattern.count
248 a_deltas_not_void: a_deltas /= Void
249 a_deltas_valid: a_deltas.count = max_ascii_character_value + 1
250 local
251 i, l_char_code: INTEGER
252 do
253 -- Initialize the delta table (one more than pattern count).
254 a_deltas.fill_with (a_pattern_count + 1, 0, max_ascii_character_value)
255
256 -- Now for each character within the pattern, fill in shifting necessary
257 -- to bring the pattern to the character. The rightmost value is kept, as
258 -- we scan the pattern from left to right (ie. if there is two 'a', only the
259 -- delta associated with the second --rightmost-- will be kept).
260 from
261 i := 0
262 until
263 i = a_pattern_count
264 loop
265 l_char_code := a_pattern.code (i + 1).to_integer_32
266 if l_char_code <= max_ascii_character_value then
267 a_deltas.put (a_pattern_count - i, l_char_code)
268 end
269 i := i + 1
270 end
271 end
272
273 initialize_fuzzy_deltas (a_pattern: STRING_GENERAL; fuzzy: INTEGER) is
274 -- Compile `a_pattern' by computing the delta shift tables from a pattern
275 -- string. This has to be done before searching occurs. The first delta
276 -- table is computed with the full pattern, the second one is computed with
277 -- the rightmost character removed, and so on. A total of (fuzzy + 1) tables
278 -- are computed and stored in `deltas_array'.
279 require
280 a_pattern_not_void: a_pattern /= Void
281 a_pattern_valid: a_pattern.is_valid_as_string_8
282 fuzzy_positive: fuzzy > 0
283 local
284 l_deltas: like deltas
285 l_deltas_array: like deltas_array
286 i, l_fuzzy: INTEGER
287 l_pattern_count: INTEGER
288 do
289 from
290 l_pattern_count := a_pattern.count
291 l_fuzzy := fuzzy + 1
292 create l_deltas_array.make (l_fuzzy)
293 until
294 i = l_fuzzy
295 loop
296 create l_deltas.make (max_ascii_character_value + 1)
297 l_deltas_array.put (l_deltas, i)
298 internal_initialize_deltas (a_pattern, l_pattern_count - i, l_deltas)
299 i := i + 1
300 end
301
302 deltas_array := l_deltas_array
303 ensure
304 deltas_array_not_void: deltas_array /= Void
305 deltas_array_count_set: deltas_array.count = fuzzy + 1
306 end
307
308 invariant
309 deltas_not_void: deltas /= Void
310 deltas_valid: deltas.count = max_ascii_character_value + 1
311
312 end

Properties

Name Value
svn:eol-style native
svn:keywords Author Date Id Revision

  ViewVC Help
Powered by ViewVC 1.1.23