1 | <?php |
---|
2 | /****************************************************************************** |
---|
3 | * |
---|
4 | * $Id:$ |
---|
5 | * |
---|
6 | * Copyright (C) 1997-2003 by Dimitri van Heesch. |
---|
7 | * |
---|
8 | * Permission to use, copy, modify, and distribute this software and its |
---|
9 | * documentation under the terms of the GNU General Public License is hereby |
---|
10 | * granted. No representations are made about the suitability of this software |
---|
11 | * for any purpose. It is provided "as is" without express or implied warranty. |
---|
12 | * See the GNU General Public License for more details. |
---|
13 | * |
---|
14 | */ |
---|
15 | |
---|
/**
 * Reads the next four bytes of $file and combines them into one integer,
 * most significant byte first.
 *
 * @param resource $file open, readable file handle
 * @return int the assembled value
 */
function readInt($file)
{
  $value = 0;
  for ($n = 0; $n < 4; $n++)
  {
    // shift the bytes read so far up and append the next one at the low end
    $value = ($value << 8) | ord(fgetc($file));
  }
  return $value;
}
---|
22 | |
---|
/**
 * Reads characters from $file up to, and consuming, the next zero byte.
 *
 * @param resource $file open, readable file handle
 * @return string the characters read, without the terminating zero byte
 */
function readString($file)
{
  $text = "";
  for ($ch = fgetc($file); ord($ch) != 0; $ch = fgetc($file))
  {
    $text .= $ch;
  }
  return $text;
}
---|
29 | |
---|
/**
 * Reads the next four characters of $file and returns them as one string.
 *
 * @param resource $file open, readable file handle
 * @return string the characters read
 */
function readHeader($file)
{
  $magic = "";
  for ($n = 0; $n < 4; $n++)
  {
    $magic .= fgetc($file);
  }
  return $magic;
}
---|
36 | |
---|
/**
 * Computes the lookup-table slot for a word from its first two bytes.
 *
 * @param string $word word to look up
 * @return int first byte * 256 + second byte, or -1 when the word is shorter
 *             than two bytes or one of its first two bytes is zero
 */
function computeIndex($word)
{
  if (strlen($word) < 2) return -1;
  // Square-bracket offsets: the older $word{0} form is a parse error on PHP 8.
  // high char of the index
  $hi = ord($word[0]);
  if ($hi == 0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo == 0) return -1;
  // return index
  return $hi * 256 + $lo;
}
---|
49 | |
---|
/**
 * Looks up $word in the index file and appends one entry to $statsList for
 * every indexed word that begins with $word.
 *
 * Each appended entry is an array with the keys "word" (the query word),
 * "match" (the indexed word), "index" (file offset of that word's
 * statistics), "full" (true when the match has the same length as the query
 * word) and "docs". Every element of "docs" holds "idx", "freq", "rank",
 * "name" and "url"; "rank" is the document's frequency divided by the summed
 * frequency of all documents collected by this call.
 *
 * @param resource $file      open index file
 * @param string   $word      query word
 * @param array    $statsList result list; new entries are appended in place
 * @return array the updated $statsList
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // no valid table slot for this word

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $index = readInt($file);
  if (!$index) return $statsList; // no words matching the first two characters

  $start = sizeof($statsList);
  $count = $start;
  fseek($file,$index);

  // Collect every listed word that has $word as a prefix. The list ends at
  // the first empty string; comparing against "" (rather than relying on
  // truthiness) keeps a word such as "0" from ending the scan early.
  $w = readString($file);
  while ($w!=="")
  {
    $statIdx = readInt($file);
    // strict comparison: loose == would treat numeric-looking strings such
    // as "1e1" and "10." as equal
    if ($word===substr($w,0,strlen($word)))
    { // found word that matches (as substring)
      $statsList[$count++]=array(
        "word"=>$word,
        "match"=>$w,
        "index"=>$statIdx,
        "full"=>strlen($w)==strlen($word),
        "docs"=>array()
      );
    }
    $w = readString($file);
  }

  // Load the document list of every new entry and sum up all frequencies.
  // NOTE(review): the previous code also tried to add the frequency of full
  // matches a second time, but did so through a misspelled (undefined)
  // variable, which had no effect on the result. That statement is dropped,
  // so the computed ranks are unchanged.
  $totalFreq = 0;
  for ($count=$start;$count<sizeof($statsList);$count++)
  {
    fseek($file,$statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx  = readInt($file);
      $freq = readInt($file);
      $docInfo[$i] = array("idx"=>$idx,"freq"=>$freq,"rank"=>0.0);
      $totalFreq += $freq;
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"] = readString($file);
      $docInfo[$i]["url"]  = readString($file);
    }
    $statsList[$count]["docs"] = $docInfo;
  }

  // compute frequency rank of the word in each doc; when nothing was counted
  // the ranks keep their initial 0.0 instead of dividing by zero
  if ($totalFreq!=0)
  {
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
      $numDocs = sizeof($statsList[$count]["docs"]);
      for ($i=0;$i<$numDocs;$i++)
      {
        $statsList[$count]["docs"][$i]["rank"] =
          (float)$statsList[$count]["docs"][$i]["freq"]/$totalFreq;
      }
    }
  }
  return $statsList;
}
---|
118 | |
---|
/**
 * Merges per-word search results into one entry per document, keyed by the
 * document URL.
 *
 * The first time a URL is seen its entry is created with the keys "url",
 * "name" and "rank". Every further hit on the same URL adds the new rank and
 * then doubles the sum. Each hit also appends an array with the keys "word",
 * "match" and "freq" to the entry's "words" list.
 *
 * @param array $results entries carrying "word", "match" and "docs"
 * @param array $docs    document map, updated in place
 * @return array the updated $docs
 */
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key  = $di["url"];
      $rank = $di["rank"];
      // Direct key lookup: searching array_keys() with in_array() is linear
      // per hit and compares loosely, so "1e3" would be taken for "1000".
      if (array_key_exists($key,$docs))
      {
        $docs[$key]["rank"] += $rank;
        $docs[$key]["rank"] *= 2; // multiple matches increases rank
      }
      else
      {
        $docs[$key] = array("url"=>$key,
                            "name"=>$di["name"],
                            "rank"=>$rank
                           );
      }
      $docs[$key]["words"][] = array(
        "word"=>$wordInfo["word"],
        "match"=>$wordInfo["match"],
        "freq"=>$di["freq"]
      );
    }
  }
  return $docs;
}
---|
149 | |
---|
/**
 * Rescales the "rank" of every document so that the highest rank becomes 100.
 *
 * @param array $docs document map, each entry carrying a "rank"; updated in
 *                    place
 * @return void
 */
function normalize_ranking(&$docs)
{
  // small positive floor so the division below never sees zero
  $maxRank = 0.0000001;
  // compute maximal rank
  foreach ($docs as $doc)
  {
    if ($doc["rank"]>$maxRank)
    {
      $maxRank = $doc["rank"];
    }
  }
  // normalize rankings; foreach replaces each(), which PHP 8 no longer has
  $scale = 100/$maxRank;
  foreach (array_keys($docs) as $key)
  {
    $docs[$key]["rank"] *= $scale;
  }
}
---|
168 | |
---|
/**
 * Returns the documents that match every required word and no forbidden word.
 *
 * A document matches a word when one element of its "words" list has that
 * word as its "word" value. Words are compared as exact strings.
 *
 * @param array $docs           document map, each entry carrying "words"
 * @param array $requiredWords  words every kept document must match
 * @param array $forbiddenWords words no kept document may match
 * @return array the kept entries of $docs under their original keys
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs = array();
  // foreach replaces each(), which PHP 8 no longer has
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy = true; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      $found = false;
      foreach ($words as $wordInfo)
      {
        if ($wordInfo["word"]===$reqWord)
        {
          $found = true;
          break;
        }
      }
      if (!$found)
      {
        $copy = false; // document lacks one of the required words
        break;
      }
    }
    if ($copy)
    {
      foreach ($words as $wordInfo)
      {
        if (in_array($wordInfo["word"],$forbiddenWords,true))
        {
          $copy = false; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key] = $doc;
  }
  return $filteredDocs;
}
---|
208 | |
---|
/**
 * Sort callback that orders documents by descending "rank".
 *
 * @param array $a document entry carrying "rank"
 * @param array $b document entry carrying "rank"
 * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 when equal
 */
function compare_rank($a,$b)
{
  // equal ranks must compare as 0, otherwise both orderings of the same pair
  // claim "greater" and the sort receives contradictory answers
  if ($a["rank"]==$b["rank"]) return 0;
  return ($a["rank"]>$b["rank"]) ? -1 : 1;
}
---|
213 | |
---|
/**
 * Copies $docs into $sorted and orders the copy with the compare_rank
 * callback. usort() renumbers the elements, so the keys of $docs are not
 * carried over.
 *
 * @param array $docs   documents to sort
 * @param array $sorted receives the sorted list
 * @return array the sorted list
 */
function sort_results($docs, &$sorted)
{
  $sorted = $docs;
  usort($sorted, 'compare_rank');

  return $sorted;
}
---|
220 | |
---|
/**
 * Prints the search results as an HTML table: a heading row, a summary row
 * produced by matches_text(), and for every document one row with its number
 * and link followed by one row listing the matched words and frequencies.
 *
 * @param array $docs list of documents, each carrying "url", "name" and
 *                    "words" (elements with "word", "match" and "freq")
 * @return void
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo " <tr>\n";
  echo " <td colspan=\"2\"><h2>Search Results</h2></td>\n";
  echo " </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo " </tr>\n";
  }
  else
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo " </td>\n";
    echo " </tr>\n";
    $num = 1;
    foreach ($docs as $doc)
    {
      // Escape everything taken from the index or the query before it goes
      // into the markup. ISO-8859-1 makes htmlspecialchars() treat each byte
      // as one character, so only the HTML special characters are replaced
      // and text in any ASCII-compatible encoding is never rejected as
      // malformed.
      // NOTE(review): assumes the index stores names and URLs as raw text,
      // not as pre-escaped HTML - confirm against the index writer.
      $url  = htmlspecialchars($doc["url"],ENT_QUOTES,'ISO-8859-1');
      $name = htmlspecialchars($doc["name"],ENT_QUOTES,'ISO-8859-1');
      echo " <tr>\n";
      echo " <td align=\"right\">$num.</td>";
      echo "<td><a class=\"el\" href=\"".$url."\">".$name."</a></td>\n";
      echo " </tr>\n"; // close the link row before the matches row starts
      echo " <tr>\n";
      echo " <td></td><td class=\"tiny\">Matches: ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the matched word that follows the query word
        $matchRight = substr($wordInfo["match"],strlen($word));
        $word       = htmlspecialchars($word,ENT_QUOTES,'ISO-8859-1');
        $matchRight = htmlspecialchars((string)$matchRight,ENT_QUOTES,'ISO-8859-1');
        echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
      }
      echo " </td>\n";
      echo " </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
---|
262 | |
---|
/**
 * Builds the summary sentence for a given number of matching documents.
 *
 * @param int $num number of documents found
 * @return string message for zero, exactly one, or several documents
 */
function matches_text($num)
{
  if ($num == 0)
  {
    return 'Sorry, no documents matching your query.';
  }
  if ($num == 1)
  {
    return 'Found 1 document matching your query.';
  }
  // any other count
  return 'Found ' . $num . ' documents matching your query. Showing best matches first.';
}
---|
278 | |
---|
/**
 * Runs a search: opens the index file, reads the "query" request parameter,
 * looks up each space-separated word, then combines, filters, normalizes,
 * sorts and prints the results.
 *
 * A word prefixed with '+' is required, a word prefixed with '-' is
 * forbidden. Stops the script with an error message when the PHP version is
 * too old, the index file cannot be opened, or its header is not "DOXS".
 *
 * @param string $idxfile path of the search index file
 * @return void
 */
function main($idxfile)
{
  // version_compare() orders versions numerically; a plain string comparison
  // would place "10.0.0" before "4.1.0"
  if (version_compare(phpversion(),'4.1.0','<'))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen($idxfile,"rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query = "";
  // ignore non-string values such as query[]=x, which arrive as arrays
  if (array_key_exists("query",$_GET) && is_string($_GET["query"]))
  {
    $query = $_GET["query"];
  }
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  // for each word in the search query; testing the token against false
  // (rather than for truthiness) keeps a word "0" from ending the loop
  for ($token=strtok($query," "); $token!==false; $token=strtok(" "))
  {
    $word = strtolower($token);
    // square-bracket offsets: the older $word{0} form is a parse error on PHP 8
    if ($word[0]=='+')
    {
      $word = (string)substr($word,1);
      if ($word!=="") $requiredWords[] = $word;
    }
    if ($word!=="" && $word[0]=='-')
    {
      $word = (string)substr($word,1);
      if ($word!=="") $forbiddenWords[] = $word;
    }
    // a bare "+" or "-" leaves nothing to search for
    if ($word==="") continue;
    if (!in_array($word,$foundWords,true))
    {
      $foundWords[] = $word;
      search($file,$word,$results);
    }
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // normalize rankings so they are in the range [0-100]
  normalize_ranking($filteredDocs);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  fclose($file);
}
---|
328 | |
---|
329 | ?> |
---|