Inverted index: Difference between revisions
Content deleted Content added
m →{{header|REXX}}: changed/added comments and whitespace, changed indentations, changed a word (for finding). |
|||
Line 2,049: | Line 2,049: | ||
} |
} |
||
}</lang> |
}</lang> |
||
=={{header|Phix}}== |
|||
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br> |
|||
Loads all text files in demo\rosetta\ and builds a list of filenames and |
|||
a dictionary of {word,file_indexes}, before a handful of quick tests.<br> |
|||
Might be better (and almost as easy) for the dictionary values to be say |
|||
{total_count, {file nos}, {file counts}}. |
|||
<lang Phix>integer word_count = 0 |
|||
sequence filenames = {} |
|||
function is_ascii(string txt) |
|||
for i=1 to length(txt) do |
|||
integer ch = txt[i] |
|||
if ch='\0' or ch>=#7F then return 0 end if |
|||
end for |
|||
return 1 |
|||
end function |
|||
procedure add_words(string name, sequence words) |
|||
filenames = append(filenames,name) |
|||
integer fn = length(filenames) |
|||
for i=1 to length(words) do |
|||
string word = words[i] |
|||
if word[1]>='a' -- skip numbers |
|||
and word[1]<='z' then |
|||
integer node = getd_index(word) |
|||
if node=0 then -- not present |
|||
setd(word,{fn}) |
|||
word_count += 1 |
|||
else |
|||
sequence val = getd_by_index(node) |
|||
if find(fn,val)=0 then |
|||
setd(word,append(val,fn)) |
|||
end if |
|||
end if |
|||
end if |
|||
end for |
|||
end procedure |
|||
procedure load_directory() |
|||
sequence d = dir(".") |
|||
for i=1 to length(d) do |
|||
if not find('d',d[i][D_ATTRIBUTES]) -- skip directories |
|||
and d[i][D_SIZE]<1024*1024*1024 then -- and files > 1GB |
|||
string name = d[i][D_NAME] |
|||
integer fn = open(name,"rb") |
|||
string txt = lower(get_text(fn)) |
|||
close(fn) |
|||
if is_ascii(txt) then -- skip any bitmaps etc |
|||
sequence words = split_any(txt,"\0\r\n\t !\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~",0,1) |
|||
add_words(name,words) |
|||
end if |
|||
end if |
|||
end for |
|||
end procedure |
|||
function lookup(sequence words) |
|||
sequence files = {} -- indexes to filenames |
|||
for i=1 to length(words) do |
|||
string word = words[i] |
|||
integer node = getd_index(word) |
|||
if node=0 then return {} end if |
|||
sequence val = getd_by_index(node) |
|||
if i=1 then |
|||
files = val |
|||
else |
|||
for j=length(files) to 1 by -1 do |
|||
if not find(files[j],val) then |
|||
files[j..j] = {} |
|||
end if |
|||
end for |
|||
if length(files)=0 then return {} end if |
|||
end if |
|||
end for |
|||
for i=1 to length(files) do |
|||
files[i] = filenames[files[i]] |
|||
end for |
|||
return files |
|||
end function |
|||
load_directory() |
|||
?word_count |
|||
?lookup({"load_directory"}) -- should only be this file |
|||
?lookup({"dir"}) -- currently two use this |
|||
?lookup({"lower"}) -- currently four use this |
|||
?lookup({"lower","dir"}) -- currently two use both |
|||
?lookup({"dir","lower"}) -- currently two use both |
|||
?lookup({"ban"&"anafish"}) -- should be none ({})</lang> |
|||
{{out}} |
|||
<pre> |
|||
3365 |
|||
{"Inverted_index.exw"} |
|||
{"Inverted_index.exw","viewppm.exw"} |
|||
{"AlmostPrime.exw","Inverted_index.exw","RockPaperScissors.exw","viewppm.exw"} |
|||
{"Inverted_index.exw","viewppm.exw"} |
|||
{"Inverted_index.exw","viewppm.exw"} |
|||
{} |
|||
</pre> |
|||
=={{header|PHP}}== |
=={{header|PHP}}== |