Inverted index: Difference between revisions

m
→‎{{header|Phix}}: added syntax colouring, marked p2js incompatible
m (→‎{{header|Phix}}: added syntax colouring, marked p2js incompatible)
Line 2,625:
Might be better (and almost as easy) for the dictionary values to be say
{total_count, {file nos}, {file counts}}.
<!--<lang Phix>(notonline)-- demo\rosetta\Inverted_index.exw>
<span style="color: #000080;font-style:italic;">-- demo\rosetta\Inverted_index.exw</span>
integer word_count = 0
<span style="color: #008080;">without</span> <span style="color: #008080;">js</span> <span style="color: #000080;font-style:italic;">-- (file i/o)</span>
sequence filenames = {}
<span style="color: #004080;">integer</span> <span style="color: #000000;">word_count</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
 
<span style="color: #004080;">sequence</span> <span style="color: #000000;">filenames</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
function is_ascii(string txt)
for i=1 to length(txt) do
<span style="color: #008080;">function</span> <span style="color: #000000;">is_ascii</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">txt</span><span style="color: #0000FF;">)</span>
integer ch = txt[i]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">txt</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
if ch='\0' or ch>=#7F then return 0 end if
<span style="color: #004080;">integer</span> <span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">txt</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
end for
<span style="color: #008080;">if</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">=</span><span style="color: #008000;">'\0'</span> <span style="color: #008080;">or</span> <span style="color: #000000;">ch</span><span style="color: #0000FF;">>=</span><span style="color: #000000;">#7F</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">0</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return 1
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
end function
<span style="color: #008080;">return</span> <span style="color: #000000;">1</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
procedure add_words(string name, sequence words)
filenames = append(filenames,name)
<span style="color: #008080;">procedure</span> <span style="color: #000000;">add_words</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">name</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">sequence</span> <span style="color: #000000;">words</span><span style="color: #0000FF;">)</span>
integer fn = length(filenames)
<span style="color: #000000;">filenames</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filenames</span><span style="color: #0000FF;">,</span><span style="color: #000000;">name</span><span style="color: #0000FF;">)</span>
for i=1 to length(words) do
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filenames</span><span style="color: #0000FF;">)</span>
string word = words[i]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">words</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
if word[1]>='a' -- skip numbers
<span style="color: #004080;">string</span> <span style="color: #000000;">word</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">words</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
and word[1]<='z' then
<span style="color: #008080;">if</span> <span style="color: #000000;">word</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]>=</span><span style="color: #008000;">'a'</span> <span style="color: #000080;font-style:italic;">-- skip numbers</span>
integer node = getd_index(word)
<span style="color: #008080;">and</span> <span style="color: #000000;">word</span><span style="color: #0000FF;">[</span><span style="color: #000000;">1</span><span style="color: #0000FF;">]<=</span><span style="color: #008000;">'z'</span> <span style="color: #008080;">then</span>
if node=0 then -- not present
<span style="color: #004080;">integer</span> <span style="color: #000000;">node</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd_index</span><span style="color: #0000FF;">(</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)</span>
setd(word,{fn})
<span style="color: #008080;">if</span> <span style="color: #000000;">node</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- not present</span>
word_count += 1
<span style="color: #7060A8;">setd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">word</span><span style="color: #0000FF;">,{</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">})</span>
else
<span style="color: #000000;">word_count</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
sequence val = getd_by_index(node)
<span if find(fn,val)style=0"color: then#008080;">else</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">val</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd_by_index</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">)</span>
setd(word,append(val,fn))
<span style="color: #008080;">if</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">,</span><span style="color: #000000;">val</span><span style="color: #0000FF;">)=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span>
end if
<span style="color: #7060A8;">setd</span><span style="color: #0000FF;">(</span><span style="color: #000000;">word</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">val</span><span style="color: #0000FF;">,</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">))</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end procedure
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
procedure load_directory()
sequence d = dir(".")
<span style="color: #008080;">procedure</span> <span style="color: #000000;">load_directory</span><span style="color: #0000FF;">()</span>
for i=1 to length(d) do
<span style="color: #004080;">sequence</span> <span style="color: #000000;">d</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">dir</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"."</span><span style="color: #0000FF;">)</span>
if not find('d',d[i][D_ATTRIBUTES]) -- skip directories
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">d</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
and d[i][D_SIZE]<1024*1024*1024 then -- and files > 1GB
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #008000;">'d'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_ATTRIBUTES</span><span style="color: #0000FF;">])</span> <span style="color: #000080;font-style:italic;">-- skip directories</span>
string name = d[i][D_NAME]
<span style="color: #008080;">and</span> <span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_SIZE</span><span style="color: #0000FF;">]<</span><span style="color: #000000;">1024</span><span style="color: #0000FF;">*</span><span style="color: #000000;">1024</span><span style="color: #0000FF;">*</span><span style="color: #000000;">1024</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- and files &gt; 1GB</span>
integer fn = open(name,"rb")
<span style="color: #004080;">string</span> <span style="color: #000000;">name</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">d</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">][</span><span style="color: #004600;">D_NAME</span><span style="color: #0000FF;">]</span>
string txt = lower(get_text(fn))
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">open</span><span style="color: #0000FF;">(</span><span style="color: #000000;">name</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"rb"</span><span style="color: #0000FF;">)</span>
close(fn)
<span style="color: #004080;">string</span> <span style="color: #000000;">txt</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">lower</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">get_text</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">))</span>
if is_ascii(txt) then -- skip any bitmaps etc
<span style="color: #7060A8;">close</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
sequence words = split_any(txt,"\0\r\n\t !\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~")
<span style="color: #008080;">if</span> <span style="color: #000000;">is_ascii</span><span style="color: #0000FF;">(</span><span style="color: #000000;">txt</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span> <span style="color: #000080;font-style:italic;">-- skip any bitmaps etc</span>
add_words(name,words)
<span style="color: #004080;">sequence</span> <span style="color: #000000;">words</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">split_any</span><span style="color: #0000FF;">(</span><span style="color: #000000;">txt</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\0\r\n\t !\"#$%&\'()*+,-./:;&lt;=&gt;?@[\\]^`{|}~"</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #000000;">add_words</span><span style="color: #0000FF;">(</span><span style="color: #000000;">name</span><span style="color: #0000FF;">,</span><span style="color: #000000;">words</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end procedure
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">procedure</span>
function lookup(sequence words)
sequence files = {} -- indexes to filenames
<span style="color: #008080;">function</span> <span style="color: #000000;">lookup</span><span style="color: #0000FF;">(</span><span style="color: #004080;">sequence</span> <span style="color: #000000;">words</span><span style="color: #0000FF;">)</span>
for i=1 to length(words) do
<span style="color: #004080;">sequence</span> <span style="color: #000000;">files</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span> <span style="color: #000080;font-style:italic;">-- indexes to filenames</span>
string word = words[i]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">words</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
integer node = getd_index(word)
<span style="color: #004080;">string</span> <span style="color: #000000;">word</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">words</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
if node=0 then return {} end if
<span style="color: #004080;">integer</span> <span style="color: #000000;">node</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd_index</span><span style="color: #0000FF;">(</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)</span>
sequence val = getd_by_index(node)
<span style="color: #008080;">if</span> <span style="color: #000000;">node</span><span style="color: #0000FF;">=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">{}</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
if i=1 then
<span style="color: #004080;">sequence</span> <span style="color: #000000;">val</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">getd_by_index</span><span style="color: #0000FF;">(</span><span style="color: #000000;">node</span><span style="color: #0000FF;">)</span>
files = val
<span style="color: #008080;">if</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">then</span>
else
<span style="color: #000000;">files</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">val</span>
for j=length(files) to 1 by -1 do
<span style="color: #008080;">else</span>
if not find(files[j],val) then
<span style="color: #008080;">for</span> <span style="color: #000000;">j</span><span style="color: #0000FF;">=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">files</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">to</span> <span style="color: #000000;">1</span> <span style="color: #008080;">by</span> <span style="color: #0000FF;">-</span><span style="color: #000000;">1</span> <span style="color: #008080;">do</span>
files[j..j] = {}
<span style="color: #008080;">if</span> <span style="color: #008080;">not</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">files</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">],</span><span style="color: #000000;">val</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">then</span>
end if
<span style="color: #000000;">files</span><span style="color: #0000FF;">[</span><span style="color: #000000;">j</span><span style="color: #0000FF;">..</span><span style="color: #000000;">j</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
if length(files)=0 then return {} end if
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
end if
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">files</span><span style="color: #0000FF;">)=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #0000FF;">{}</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
for i=1 to length(files) do
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
files[i] = filenames[files[i]]
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">files</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
end for
<span style="color: #000000;">files</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">filenames</span><span style="color: #0000FF;">[</span><span style="color: #000000;">files</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]]</span>
return files
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
end function
<span style="color: #008080;">return</span> <span style="color: #000000;">files</span>
 
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
load_directory()
?word_count
<span style="color: #000000;">load_directory</span><span style="color: #0000FF;">()</span>
?lookup({"load_directory"}) -- should only be this file
<span style="color: #0000FF;">?</span><span style="color: #000000;">word_count</span>
?lookup({"dir"}) -- currently two use this
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"load_directory"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- should only be this file</span>
?lookup({"lower"}) -- currently four use this
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"dir"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- currently two use this</span>
?lookup({"lower","dir"}) -- currently two use both
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"lower"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- currently four use this</span>
?lookup({"dir","lower"}) -- should be the same two
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"lower"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"dir"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- currently two use both</span>
?lookup({"ban"&"anafish"}) -- should be none ({})</lang>
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"dir"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"lower"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- should be the same two</span>
<span style="color: #0000FF;">?</span><span style="color: #000000;">lookup</span><span style="color: #0000FF;">({</span><span style="color: #008000;">"ban"</span><span style="color: #0000FF;">&</span><span style="color: #008000;">"anafish"</span><span style="color: #0000FF;">})</span> <span style="color: #000080;font-style:italic;">-- should be none ({})</span>
<!--</lang>-->
{{out}}
Note the distributed version has been modified and is occasionally used for real, so the output will likely differ.
<pre>
3365
7,820

edits