Inverted index: Difference between revisions

Content added Content deleted
(Rename Perl 6 -> Raku, alphabetize, minor clean-up)
(Add source for Rust)
Line 5: Line 5:


;Task:
;Task:
Given a set of text files, implement a program to create an inverted index.
Given a set of text files, implement a program to create an inverted index.


Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms.
Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms.


The search index can be in memory.
The search index can be in memory.
<br><br>
<br><br>


Line 138: Line 138:


Enter one or more words to search for; <return> to finish:
Enter one or more words to search for; <return> to finish:
it
it
Found in the following files: 0.txt, 1.txt, 2.txt
Found in the following files: 0.txt, 1.txt, 2.txt


Line 545: Line 545:
Loop, %files%, 0,1
Loop, %files%, 0,1
{
{
tooltip,%A_index% / 500
tooltip,%A_index% / 500

wordList := WordsIn(A_LoopFileFullPath)
wordList := WordsIn(A_LoopFileFullPath)
InvertedIndex(wordList, A_loopFileFullpath)
InvertedIndex(wordList, A_loopFileFullpath)
}
}


Line 566: Line 566:


WordsIn(docpath)
WordsIn(docpath)
{
{
FileRead, content, %docpath%
FileRead, content, %docpath%
spos = 1
spos = 1
Line 576: Line 576:
this_wordList .= match "`n"
this_wordList .= match "`n"
}
}

Sort, this_wordList, U
Sort, this_wordList, U
return this_wordList
return this_wordList
}
}


Line 585: Line 585:
global word2docs
global word2docs


loop, parse, words, `n,`r
loop, parse, words, `n,`r
{
{
if A_loopField =
if A_loopField =
continue
continue
Line 609: Line 609:
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \
\ "BBCKEY3.TXT", "BBCKEY4.TXT"
\ "BBCKEY3.TXT", "BBCKEY4.TXT"

DictSize% = 30000
DictSize% = 30000
DIM Index{(DictSize%-1) word$, link%}
DIM Index{(DictSize%-1) word$, link%}

REM Build inverted index:
REM Build inverted index:
FOR file% = DIM(FileList$(),1) TO 0 STEP -1
FOR file% = DIM(FileList$(),1) TO 0 STEP -1
Line 618: Line 618:
F% = OPENIN(filename$)
F% = OPENIN(filename$)
IF F% = 0 ERROR 100, "Failed to open file"
IF F% = 0 ERROR 100, "Failed to open file"

WHILE NOT EOF#F%
WHILE NOT EOF#F%
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%)
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%)
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65
word$ = FNlower(LEFT$(word$))
word$ = FNlower(LEFT$(word$))

hash% = FNhash(word$)
hash% = FNhash(word$)
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$
Line 636: Line 636:
ENDIF
ENDIF
ENDWHILE
ENDWHILE

CLOSE #F%
CLOSE #F%
NEXT file%
NEXT file%

REM Now query the index:
REM Now query the index:
PRINT FNquery("random")
PRINT FNquery("random")
Line 646: Line 646:
PRINT FNquery("the")
PRINT FNquery("the")
END
END

DEF FNquery(A$)
DEF FNquery(A$)
LOCAL hash%, link%, temp%
LOCAL hash%, link%, temp%
Line 663: Line 663:
ENDWHILE
ENDWHILE
= LEFT$(LEFT$(A$))
= LEFT$(LEFT$(A$))

DEF FNhash(A$)
DEF FNhash(A$)
LOCAL hash%
LOCAL hash%
Line 670: Line 670:
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4)
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4)
= hash% MOD DictSize%
= hash% MOD DictSize%

DEF FNlower(A$)
DEF FNlower(A$)
LOCAL A%,C%
LOCAL A%,C%
Line 694: Line 694:
int chr_idx[256] = {0};
int chr_idx[256] = {0};
char idx_chr[256] = {0};
char idx_chr[256] = {0};

#define FNAME 0
#define FNAME 0
typedef struct trie_t *trie, trie_t;
typedef struct trie_t *trie, trie_t;
struct trie_t {
struct trie_t {
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */
int eow;
int eow;
};
};

trie trie_new() { return calloc(sizeof(trie_t), 1); }
trie trie_new() { return calloc(sizeof(trie_t), 1); }

#define find_word(r, w) trie_trav(r, w, 1)
#define find_word(r, w) trie_trav(r, w, 1)
/* tree traversal: returns node if end of word and matches string, optionally
/* tree traversal: returns node if end of word and matches string, optionally
Line 710: Line 710:
trie trie_trav(trie root, const char * str, int no_create)
trie trie_trav(trie root, const char * str, int no_create)
{
{
int c;
int c;
while (root) {
while (root) {
if ((c = str[0]) == '\0') {
if ((c = str[0]) == '\0') {
if (!root->eow && no_create) return 0;
if (!root->eow && no_create) return 0;
break;
break;
}
}
if (! (c = chr_idx[c]) ) {
if (! (c = chr_idx[c]) ) {
str++;
str++;
continue;
continue;
}
}

if (!root->next[c]) {
if (!root->next[c]) {
if (no_create) return 0;
if (no_create) return 0;
root->next[c] = trie_new();
root->next[c] = trie_new();
}
}
root = root->next[c];
root = root->next[c];
str++;
str++;
}
}
return root;
return root;
}
}

/* complete traversal of whole tree, calling callback at each end of word node.
/* complete traversal of whole tree, calling callback at each end of word node.
* similar method can be used to free nodes, had we wanted to do that.
* similar method can be used to free nodes, had we wanted to do that.
Line 736: Line 736:
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
{
{
int i;
int i;
if (root->eow && !callback(path)) return 0;
if (root->eow && !callback(path)) return 0;

for (i = 1; i < sizeof(chr_legal); i++) {
for (i = 1; i < sizeof(chr_legal); i++) {
if (!root->next[i]) continue;
if (!root->next[i]) continue;

path[depth] = idx_chr[i];
path[depth] = idx_chr[i];
path[depth + 1] = '\0';
path[depth + 1] = '\0';
if (!trie_all(root->next[i], path, depth + 1, callback))
if (!trie_all(root->next[i], path, depth + 1, callback))
return 0;
return 0;
}
}
return 1;
return 1;
}
}

void add_index(trie root, const char *word, const char *fname)
void add_index(trie root, const char *word, const char *fname)
{
{
trie x = trie_trav(root, word, 0);
trie x = trie_trav(root, word, 0);
x->eow = 1;
x->eow = 1;

if (!x->next[FNAME])
if (!x->next[FNAME])
x->next[FNAME] = trie_new();
x->next[FNAME] = trie_new();
x = trie_trav(x->next[FNAME], fname, 0);
x = trie_trav(x->next[FNAME], fname, 0);
x->eow = 1;
x->eow = 1;
}
}

int print_path(char *path)
int print_path(char *path)
{
{
printf(" %s", path);
printf(" %s", path);
return 1;
return 1;
}
}

/* pretend we parsed text files and got lower cased words: dealing *
/* pretend we parsed text files and got lower cased words: dealing *
* with text file is a whole other animal and would make code too long */
* with text file is a whole other animal and would make code too long */
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
const char *text[][5] ={{ "it", "is", "what", "it", "is" },
const char *text[][5] ={{ "it", "is", "what", "it", "is" },
{ "what", "is", "it", 0 },
{ "what", "is", "it", 0 },
{ "it", "is", "a", "banana", 0 }};
{ "it", "is", "a", "banana", 0 }};

trie init_tables()
trie init_tables()
{
{
int i, j;
int i, j;
trie root = trie_new();
trie root = trie_new();
for (i = 0; i < sizeof(chr_legal); i++) {
for (i = 0; i < sizeof(chr_legal); i++) {
chr_idx[(int)chr_legal[i]] = i + 1;
chr_idx[(int)chr_legal[i]] = i + 1;
idx_chr[i + 1] = chr_legal[i];
idx_chr[i + 1] = chr_legal[i];
}
}


/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
* You need to have files named like above files[], with words in them
* You need to have files named like above files[], with words in them
* like in text[][]. Case doesn't matter (told you it's advanced).
* like in text[][]. Case doesn't matter (told you it's advanced).
*/
*/
#define USE_ADVANCED_FILE_HANDLING 0
#define USE_ADVANCED_FILE_HANDLING 0
#if USE_ADVANCED_FILE_HANDLING
#if USE_ADVANCED_FILE_HANDLING
void read_file(const char * fname) {
void read_file(const char * fname) {
char cmd[1024];
char cmd[1024];
char word[1024];
char word[1024];
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname);
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname);
FILE *in = popen(cmd, "r");
FILE *in = popen(cmd, "r");
while (!feof(in)) {
while (!feof(in)) {
fscanf(in, "%1000s", word);
fscanf(in, "%1000s", word);
add_index(root, word, fname);
add_index(root, word, fname);
}
}
pclose(in);
pclose(in);
};
};


read_file("f1.txt");
read_file("f1.txt");
read_file("source/f2.txt");
read_file("source/f2.txt");
read_file("other_file");
read_file("other_file");
#else
#else
for (i = 0; i < 3; i++) {
for (i = 0; i < 3; i++) {
for (j = 0; j < 5; j++) {
for (j = 0; j < 5; j++) {
if (!text[i][j]) break;
if (!text[i][j]) break;
add_index(root, text[i][j], files[i]);
add_index(root, text[i][j], files[i]);
}
}
}
}
#endif /*USE_ADVANCED_FILE_HANDLING*/
#endif /*USE_ADVANCED_FILE_HANDLING*/


return root;
return root;
}
}

void search_index(trie root, const char *word)
void search_index(trie root, const char *word)
{
{
char path[1024];
char path[1024];
printf("Search for \"%s\": ", word);
printf("Search for \"%s\": ", word);
trie found = find_word(root, word);
trie found = find_word(root, word);

if (!found) printf("not found\n");
if (!found) printf("not found\n");
else {
else {
trie_all(found->next[FNAME], path, 0, print_path);
trie_all(found->next[FNAME], path, 0, print_path);
printf("\n");
printf("\n");
}
}
}
}

int main()
int main()
{
{
trie root = init_tables();
trie root = init_tables();

search_index(root, "what");
search_index(root, "what");
search_index(root, "is");
search_index(root, "is");
search_index(root, "banana");
search_index(root, "banana");
search_index(root, "boo");
search_index(root, "boo");
return 0;
return 0;
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt
Search for "is": f1.txt other_file source/f2.txt
Search for "is": f1.txt other_file source/f2.txt
Line 889: Line 889:
{
{
public:
public:
node() { clear(); }
node() { clear(); }
node( char z ) { clear(); }
node( char z ) { clear(); }
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; }
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; }
Line 933: Line 933:
const std::vector<std::string>& find( std::string s ) {
const std::vector<std::string>& find( std::string s ) {
size_t idx;
size_t idx;
std::transform( s.begin(), s.end(), s.begin(), tolower );
std::transform( s.begin(), s.end(), s.begin(), tolower );
node* rt = &root;
node* rt = &root;
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) {
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) {
idx = _CHARS.find( *i );
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
if( idx < MAX_NODES ) {
if( !rt->next[idx] ) return std::vector<std::string>();
if( !rt->next[idx] ) return std::vector<std::string>();
rt = rt->next[idx];
rt = rt->next[idx];
}
}
}
}
if( rt->isWord ) return rt->files;
if( rt->isWord ) return rt->files;
return std::vector<std::string>();
return std::vector<std::string>();
Line 951: Line 951:
idx = _CHARS.find( *i );
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
if( idx < MAX_NODES ) {
n = rt->next[idx];
n = rt->next[idx];
if( n ){
if( n ){
rt = n;
rt = n;
continue;
continue;
}
}
n = new node( *i );
n = new node( *i );
rt->next[idx] = n;
rt->next[idx] = n;
rt = n;
rt = n;
}
}
}
}
Line 983: Line 983:
}
}
}
}

while( true ) {
while( true ) {
std::cout << "Enter one word to search for, return to exit: ";
std::cout << "Enter one word to search for, return to exit: ";
Line 1,023: Line 1,023:
(defn term-seq [text] (map normalize (re-seq pattern text)))
(defn term-seq [text] (map normalize (re-seq pattern text)))


(defn set-assoc
(defn set-assoc
"Produces map with v added to the set associated with key k in map m"
"Produces map with v added to the set associated with key k in map m"
[m k v] (assoc m k (conj (get m k #{}) v)))
[m k v] (assoc m k (conj (get m k #{}) v)))
Line 1,064: Line 1,064:
console.log "#{fn}:#{line_num}"
console.log "#{fn}:#{line_num}"
console.log "\n"
console.log "\n"

get_words = (line) ->
get_words = (line) ->
words = line.replace(/\W/g, ' ').split ' '
words = line.replace(/\W/g, ' ').split ' '
Line 1,081: Line 1,081:
output
output
<lang>
<lang>
> coffee inverted_index.coffee
> coffee inverted_index.coffee
locations for 'make_index':
locations for 'make_index':
inverted_index.coffee:3
inverted_index.coffee:3
Line 1,211: Line 1,211:


;; get text for each file, and call (action filename text)
;; get text for each file, and call (action filename text)
(define (map-files action files)
(define (map-files action files)
(for ((file files))
(for ((file files))
(file->string action file)))
(file->string action file)))

;; create store
;; create store
(local-make-store INVERT)
(local-make-store INVERT)


; invert-word : word -> set of files
; invert-word : word -> set of files
(define (invert-word word file store)
(define (invert-word word file store)
(local-put-value word
(local-put-value word
(make-set (cons file (local-get-value word store))) store))
(make-set (cons file (local-get-value word store))) store))

; parse file text and invert each word
; parse file text and invert each word
(define (invert-text file text)
(define (invert-text file text)
(writeln 'Inverting file text)
(writeln 'Inverting file text)
(let ((text (text-parse text)))
(let ((text (text-parse text)))
(for ((word text)) (invert-word (string-downcase word) file INVERT))))
(for ((word text)) (invert-word (string-downcase word) file INVERT))))
</lang>
</lang>


Line 1,233: Line 1,233:
Intersect sets values of each word.
Intersect sets values of each word.
<lang lisp>
<lang lisp>
;; usage : (inverted-search w1 w2 ..)
;; usage : (inverted-search w1 w2 ..)
(define-syntax-rule (inverted-search w ...)
(define-syntax-rule (inverted-search w ...)
(and-get-invert (quote w )))
(and-get-invert (quote w )))


;; intersects all sets referenced by words
;; intersects all sets referenced by words
;; returns the intersection
;; returns the intersection
(define (and-get-invert words)
(define (and-get-invert words)
(foldl
(foldl
(lambda(word res)
(lambda(word res)
(set-intersect res (local-get-value word INVERT)))
(set-intersect res (local-get-value word INVERT)))
FILES words))
FILES words))
</lang>
</lang>
Output :
Output :
Line 1,271: Line 1,271:
lists:foldl( fun import_from_file/2, dict:new(), Files ).
lists:foldl( fun import_from_file/2, dict:new(), Files ).


search( Binaries, Inverted_index ) ->
search( Binaries, Inverted_index ) ->
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries],
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries],
lists:foldl( fun search_common/2, Files, T ).
lists:foldl( fun search_common/2, Files, T ).
Line 1,287: Line 1,287:
import_from_file( File, Dict_acc ) ->
import_from_file( File, Dict_acc ) ->
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ),
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ),
dict:merge( fun import_from_file_merge/3, Dict_acc, New_dict ).
dict:merge( fun import_from_file_merge/3, Dict_acc, New_dict ).


import_from_file_contents( File, {ok, Binary} ) ->
import_from_file_contents( File, {ok, Binary} ) ->
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )];
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )];
import_from_file_contents( File, {error, Error} ) ->
import_from_file_contents( File, {error, Error} ) ->
io:fwrite( "Error: could not open file ~p: ~p~nContinuing with the rest of them~n", [File, Error] ),
io:fwrite( "Error: could not open file ~p: ~p~nContinuing with the rest of them~n", [File, Error] ),
[].
[].


import_from_file_merge( _Key, Files, [New_file] ) -> [New_file | Files].
import_from_file_merge( _Key, Files, [New_file] ) -> [New_file | Files].


search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)].
search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)].
Line 1,306: Line 1,306:
// Map search terms to associated set of files
// Map search terms to associated set of files
type searchIndexMap = Map<string, Set<string>>
type searchIndexMap = Map<string, Set<string>>

let inputSearchCriteria() =
let inputSearchCriteria() =
let readLine prompt =
let readLine prompt =
printf "%s: " prompt
printf "%s: " prompt
Console.ReadLine().Split()
Console.ReadLine().Split()

readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())
readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())

let updateIndex indexMap keyValuePair =
let updateIndex indexMap keyValuePair =
let k, v = keyValuePair
let k, v = keyValuePair

match Map.tryFind k indexMap with
match Map.tryFind k indexMap with
| None -> Map.add k (Set.singleton v) indexMap
| None -> Map.add k (Set.singleton v) indexMap
| Some set -> Map.add k (Set.add v set) indexMap
| Some set -> Map.add k (Set.add v set) indexMap

let buildIndex files =
let buildIndex files =
let fileData file =
let fileData file =
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file)
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file)

files |> Seq.collect fileData
files |> Seq.collect fileData
|> Seq.fold updateIndex Map.empty
|> Seq.fold updateIndex Map.empty

let searchFiles() =
let searchFiles() =
let files, terms = inputSearchCriteria()
let files, terms = inputSearchCriteria()
Line 1,481: Line 1,481:
}
}
return true
return true
}
}

func ui() {
func ui() {
fmt.Println(len(index), "words indexed in", len(indexed), "files")
fmt.Println(len(index), "words indexed in", len(indexed), "files")
Line 1,500: Line 1,500:
fmt.Println("one match:")
fmt.Println("one match:")
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
default:
default:
fmt.Println(len(dl), "matches:")
fmt.Println(len(dl), "matches:")
for _, d := range dl {
for _, d := range dl {
Line 1,580: Line 1,580:


every textname := key(texts) do # build index for each 'text'
every textname := key(texts) do # build index for each 'text'
SII := InvertedIndex(SII,textname,texts[textname])
SII := InvertedIndex(SII,textname,texts[textname])

TermSearchUI(SII) # search UI
TermSearchUI(SII) # search UI

end
end


Line 1,601: Line 1,601:
repeat {
repeat {
writes("Enter search terms (^z to quit) : ")
writes("Enter search terms (^z to quit) : ")
terms := map(trim(read() | break))
terms := map(trim(read() | break))


x := []
x := []
terms ? while not pos(0) do {
terms ? while not pos(0) do {
tab(many(' \t'))
tab(many(' \t'))
put(x,tab(upto('\ \t')|0))
put(x,tab(upto('\ \t')|0))
}
}

show("Searching for : ",x)
show("Searching for : ",x)
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x)
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x)
}
}
write("End of search")
write("End of search")
Line 1,617: Line 1,617:


procedure TermSearch(ii,x) #: return set of matches or fail
procedure TermSearch(ii,x) #: return set of matches or fail
every s := !x do
every s := !x do
( /u := ii[s] ) | (u **:= ii[s])
( /u := ii[s] ) | (u **:= ii[s])
if *u > 0 then return u
if *u > 0 then return u
Line 1,625: Line 1,625:
every writes(s|!x) do writes(" ")
every writes(s|!x) do writes(" ")
write()
write()
return
return
end</lang>
end</lang>


Line 1,722: Line 1,722:
public class InvertedIndex {
public class InvertedIndex {


List<String> stopwords = Arrays.asList("a", "able", "about",
List<String> stopwords = Arrays.asList("a", "able", "about",
"across", "after", "all", "almost", "also", "am", "among", "an",
"across", "after", "all", "almost", "also", "am", "among", "an",
"and", "any", "are", "as", "at", "be", "because", "been", "but",
"and", "any", "are", "as", "at", "be", "because", "been", "but",
"by", "can", "cannot", "could", "dear", "did", "do", "does",
"by", "can", "cannot", "could", "dear", "did", "do", "does",
"either", "else", "ever", "every", "for", "from", "get", "got",
"either", "else", "ever", "every", "for", "from", "get", "got",
"had", "has", "have", "he", "her", "hers", "him", "his", "how",
"had", "has", "have", "he", "her", "hers", "him", "his", "how",
"however", "i", "if", "in", "into", "is", "it", "its", "just",
"however", "i", "if", "in", "into", "is", "it", "its", "just",
"least", "let", "like", "likely", "may", "me", "might", "most",
"least", "let", "like", "likely", "may", "me", "might", "most",
"must", "my", "neither", "no", "nor", "not", "of", "off", "often",
"must", "my", "neither", "no", "nor", "not", "of", "off", "often",
"on", "only", "or", "other", "our", "own", "rather", "said", "say",
"on", "only", "or", "other", "our", "own", "rather", "said", "say",
"says", "she", "should", "since", "so", "some", "than", "that",
"says", "she", "should", "since", "so", "some", "than", "that",
"the", "their", "them", "then", "there", "these", "they", "this",
"the", "their", "them", "then", "there", "these", "they", "this",
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were",
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were",
"what", "when", "where", "which", "while", "who", "whom", "why",
"what", "when", "where", "which", "while", "who", "whom", "why",
"will", "with", "would", "yet", "you", "your");
"will", "with", "would", "yet", "you", "your");


Map<String, List<Tuple>> index = new HashMap<String, List<Tuple>>();
Map<String, List<Tuple>> index = new HashMap<String, List<Tuple>>();
List<String> files = new ArrayList<String>();
List<String> files = new ArrayList<String>();


public void indexFile(File file) throws IOException {
public void indexFile(File file) throws IOException {
int fileno = files.indexOf(file.getPath());
int fileno = files.indexOf(file.getPath());
if (fileno == -1) {
if (fileno == -1) {
files.add(file.getPath());
files.add(file.getPath());
fileno = files.size() - 1;
fileno = files.size() - 1;
}
}


int pos = 0;
int pos = 0;
BufferedReader reader = new BufferedReader(new FileReader(file));
BufferedReader reader = new BufferedReader(new FileReader(file));
for (String line = reader.readLine(); line != null; line = reader
for (String line = reader.readLine(); line != null; line = reader
.readLine()) {
.readLine()) {
for (String _word : line.split("\\W+")) {
for (String _word : line.split("\\W+")) {
String word = _word.toLowerCase();
String word = _word.toLowerCase();
pos++;
pos++;
if (stopwords.contains(word))
if (stopwords.contains(word))
continue;
continue;
List<Tuple> idx = index.get(word);
List<Tuple> idx = index.get(word);
if (idx == null) {
if (idx == null) {
idx = new LinkedList<Tuple>();
idx = new LinkedList<Tuple>();
index.put(word, idx);
index.put(word, idx);
}
}
idx.add(new Tuple(fileno, pos));
idx.add(new Tuple(fileno, pos));
}
}
}
}
System.out.println("indexed " + file.getPath() + " " + pos + " words");
System.out.println("indexed " + file.getPath() + " " + pos + " words");
}
}


public void search(List<String> words) {
public void search(List<String> words) {
for (String _word : words) {
for (String _word : words) {
Set<String> answer = new HashSet<String>();
Set<String> answer = new HashSet<String>();
String word = _word.toLowerCase();
String word = _word.toLowerCase();
List<Tuple> idx = index.get(word);
List<Tuple> idx = index.get(word);
if (idx != null) {
if (idx != null) {
for (Tuple t : idx) {
for (Tuple t : idx) {
answer.add(files.get(t.fileno));
answer.add(files.get(t.fileno));
}
}
}
}
System.out.print(word);
System.out.print(word);
for (String f : answer) {
for (String f : answer) {
System.out.print(" " + f);
System.out.print(" " + f);
}
}
System.out.println("");
System.out.println("");
}
}
}
}


public static void main(String[] args) {
public static void main(String[] args) {
try {
try {
InvertedIndex idx = new InvertedIndex();
InvertedIndex idx = new InvertedIndex();
for (int i = 1; i < args.length; i++) {
for (int i = 1; i < args.length; i++) {
idx.indexFile(new File(args[i]));
idx.indexFile(new File(args[i]));
}
}
idx.search(Arrays.asList(args[0].split(",")));
idx.search(Arrays.asList(args[0].split(",")));
} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
e.printStackTrace();
}
}
}
}


private class Tuple {
private class Tuple {
private int fileno;
private int fileno;
private int position;
private int position;


public Tuple(int fileno, int position) {
public Tuple(int fileno, int position) {
this.fileno = fileno;
this.fileno = fileno;
this.position = position;
this.position = position;
}
}
}
}
}
}


Line 1,813: Line 1,813:
Example output:
Example output:
<lang Java>
<lang Java>
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt
indexed pg30637.txt 106473 words
indexed pg30637.txt 106473 words
indexed pg7025.txt 205714 words
indexed pg7025.txt 205714 words
Line 1,848: Line 1,848:
def overlap(that): . as $this
def overlap(that): . as $this
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end);
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end);

. as $dict
. as $dict
| if (words|length) == 0 then []
| if (words|length) == 0 then []
Line 1,864: Line 1,864:
<lang jq>def prompt_search:
<lang jq>def prompt_search:
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:",
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:",
( (input | if type == "array" then . elif type == "string" then [.]
( (input | if type == "array" then . elif type == "string" then [.]
else empty
else empty
end) as $in
end) as $in
Line 1,997: Line 1,997:


Enter word(s) to be searched for in these files or 'q' to quit
Enter word(s) to be searched for in these files or 'q' to quit
? : cat
? : cat


'cat' not found
'cat' not found
Line 2,046: Line 2,046:
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \
inv.ml
inv.ml

ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo
ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo


Line 2,143: Line 2,143:
foreach my $file (@files)
foreach my $file (@files)
{
{
open(F, "<", $file) or die "Can't read file $file: $!";
open(F, "<", $file) or die "Can't read file $file: $!";
while(<F>) {
while(<F>) {
s/\A\W+//;
s/\A\W+//;
foreach my $w (map {lc} grep {length() >= 3} split /\W+/)
foreach my $w (map {lc} grep {length() >= 3} split /\W+/)
{
{
if ( exists($iindex{$w}) )
if ( exists($iindex{$w}) )
{
{
$iindex{$w}->insert($file);
$iindex{$w}->insert($file);
} else {
} else {
$iindex{$w} = set($file);
$iindex{$w} = set($file);
}
}
}
}
}
}
close(F);
close(F);
}
}
return %iindex;
return %iindex;
Line 2,167: Line 2,167:
my @words = @_;
my @words = @_;
my $res = set();
my $res = set();

foreach my $w (map {lc} @words)
foreach my $w (map {lc} @words)
{
{
$w =~ s/\W+//g; # strip non-words chars
$w =~ s/\W+//g; # strip non-words chars
length $w < 3 and next;
length $w < 3 and next;
exists $idx{$w} or return set();
exists $idx{$w} or return set();
Line 2,189: Line 2,189:
=={{header|Phix}}==
=={{header|Phix}}==
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br>
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br>
Loads all text files in demo\rosetta\ and builds a list of filenames and
Loads all text files in demo\rosetta\ and builds a list of filenames and
a dictionary of {word,file_indexes}, before a handful of quick tests.<br>
a dictionary of {word,file_indexes}, before a handful of quick tests.<br>
Might be better (and almost as easy) for the dictionary values to be say
Might be better (and almost as easy) for the dictionary values to be say
Line 2,376: Line 2,376:
# Create index hashtable, as needed
# Create index hashtable, as needed
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} }
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} }

# For each file to be indexed...
# For each file to be indexed...
ForEach ( $File in $FileList )
ForEach ( $File in $FileList )
Line 2,382: Line 2,382:
# Find any previously existing entries for this file
# Find any previously existing entries for this file
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File }
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File }

# For any previously existing entries
# For any previously existing entries
# Delete them (prior to reindexing the file)
# Delete them (prior to reindexing the file)
Line 2,389: Line 2,389:
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } )
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } )
}
}

# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates
# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique

# For each word in the file...
# For each word in the file...
ForEach ( $Word in $Words )
ForEach ( $Word in $Words )
Line 2,410: Line 2,410:
}
}
}
}

function Find-Word ( [string]$Word )
function Find-Word ( [string]$Word )
{
{
Line 2,420: Line 2,420:
various words.
various words.
'@ | Out-File -FilePath C:\Test\File1.txt
'@ | Out-File -FilePath C:\Test\File1.txt

@'
@'
Create an index
Create an index
of words.
of words.
'@ | Out-File -FilePath C:\Test\File2.txt
'@ | Out-File -FilePath C:\Test\File2.txt

@'
@'
Use the index
Use the index
Line 2,628: Line 2,628:
{{works with|rakudo|2015-09-16}}
{{works with|rakudo|2015-09-16}}
<lang perl6>sub MAIN (*@files) {
<lang perl6>sub MAIN (*@files) {
my %norm;
my %norm;
do for @files -> $file {
do for @files -> $file {
%norm.push: $file X=> slurp($file).lc.words;
%norm.push: $file X=> slurp($file).lc.words;
Line 2,862: Line 2,862:
> ./indexsearch.rb It iS\!
> ./indexsearch.rb It iS\!
["file1", "file2", "file3"]</pre>
["file1", "file2", "file3"]</pre>

=={{header|Rust}}==
<lang Rust>// Part 1: Inverted index structure

use std::{
borrow::Borrow,
collections::{BTreeMap, BTreeSet},
};

/// An inverted index: maps each token to the ordered set of source ids
/// (positions in `sources`) in which that token occurs.
#[derive(Debug, Default)]
pub struct InvertedIndex<T> {
    indexed: BTreeMap<String, BTreeSet<usize>>,
    sources: Vec<T>,
}

impl<T> InvertedIndex<T> {
    /// Registers `source` and records every token in `tokens` as occurring in it.
    pub fn add<I, V>(&mut self, source: T, tokens: I)
    where
        I: IntoIterator<Item = V>,
        V: Into<String>,
    {
        // A source's id is simply its position in `sources`.
        let id = self.sources.len();
        self.sources.push(source);
        for tok in tokens {
            let postings = self.indexed.entry(tok.into()).or_insert_with(BTreeSet::new);
            postings.insert(id);
        }
    }

    /// Returns the sources containing *all* of `tokens` (conjunctive query).
    /// An empty query, or any token with no hits, yields an empty iterator.
    pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T>
    where
        String: Borrow<K>,
        K: Ord + ?Sized + 'a,
        I: IntoIterator<Item = &'a K>,
    {
        let mut query = tokens.into_iter();

        // Seed the running intersection with the first token's posting set,
        // then narrow it with each remaining token, stopping early once empty.
        let mut hits: BTreeSet<usize> = BTreeSet::new();
        if let Some(first) = query.next() {
            if let Some(postings) = self.indexed.get(first) {
                hits = postings.clone();
                for tok in query {
                    match self.indexed.get(tok) {
                        Some(postings) => {
                            hits = hits.intersection(postings).cloned().collect();
                        }
                        None => hits.clear(),
                    }
                    if hits.is_empty() {
                        break;
                    }
                }
            }
        }

        hits.into_iter().map(move |id| &self.sources[id])
    }

    /// All distinct tokens currently indexed, in sorted order.
    pub fn tokens(&self) -> impl Iterator<Item = &str> {
        self.indexed.keys().map(String::as_str)
    }

    /// The registered sources, in insertion order.
    pub fn sources(&self) -> &[T] {
        &self.sources
    }
}

// Part 2: File walking and processing

use std::{
ffi::OsString,
fmt::{Debug, Display},
fs::{read_dir, DirEntry, File, ReadDir},
io::{self, stdin, Read},
path::{Path, PathBuf},
};

/// Recursive directory walker that yields every plain-file `DirEntry`
/// below a root directory (see the `Iterator` impl for the traversal).
#[derive(Debug)]
pub struct Files {
    // Stack of directory readers still in progress; the top of the
    // stack is the directory currently being traversed.
    dirs: Vec<ReadDir>,
}

impl Files {
    /// Starts a walk rooted at `path`. Fails only if the root directory
    /// itself cannot be opened.
    pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        Ok(Files {
            dirs: vec![read_dir(path)?],
        })
    }
}

impl Iterator for Files {
    type Item = DirEntry;

    // Depth-first traversal driven by the explicit `dirs` stack: plain
    // files are yielded one at a time; subdirectories are pushed onto the
    // stack and descended into before the current directory is resumed.
    fn next(&mut self) -> Option<Self::Item> {
        'outer: while let Some(mut current) = self.dirs.pop() {
            while let Some(entry) = current.next() {
                // Entries that fail to read (Err) are silently skipped.
                if let Ok(entry) = entry {
                    let path = entry.path();
                    if !path.is_dir() {
                        // A file: push the reader back so the next call
                        // resumes this directory, then yield the entry.
                        self.dirs.push(current);
                        return Some(entry);
                    } else if let Ok(dir) = read_dir(path) {
                        // A readable subdirectory: remember where we were,
                        // then descend into it (it is now top of stack).
                        self.dirs.push(current);
                        self.dirs.push(dir);
                        continue 'outer;
                    }
                    // Unreadable subdirectories are skipped.
                }
            }
        }

        None // No directory left
    }
}

/// Splits `input` on every non-alphanumeric character and yields each
/// non-empty piece lower-cased. Borrows `input` for the iterator's life.
fn tokenize<'a>(input: &'a str) -> impl Iterator<Item = String> + 'a {
    let is_separator = |c: char| !c.is_alphanumeric();
    input.split(is_separator).filter_map(|piece| {
        if piece.is_empty() {
            None
        } else {
            Some(piece.to_lowercase())
        }
    })
}

/// Reads the file at `path` and returns the set of distinct tokens it
/// contains. Invalid UTF-8 is replaced lossily rather than rejected.
fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> {
    let mut raw = Vec::new();
    let mut file = File::open(path)?;
    file.read_to_end(&mut raw)?;
    let text = String::from_utf8_lossy(&raw);
    let tokens: BTreeSet<String> = tokenize(&text).collect();
    Ok(tokens)
}

/// Tokenizes a search query into distinct tokens, longest first.
fn tokenize_query(input: &str) -> Vec<String> {
    use std::cmp::Reverse;

    // Deduplicate via a BTreeSet, then sort by descending length:
    // longer (rarer) tokens should narrow the result set faster.
    let unique = tokenize(input).collect::<BTreeSet<_>>();
    let mut result = unique.into_iter().collect::<Vec<_>>();
    // Stable sort + Reverse keeps equal-length tokens in alphabetical
    // order, matching the old `usize::MAX - len` trick without the hack.
    result.sort_by_key(|token| Reverse(token.len()));
    result
}

// Part 3: Interactive application

/// Parses the command line: the first argument is the root path to
/// index, every remaining argument is a file extension to accept.
/// Fails when the path argument is missing.
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> {
    let mut cli = std::env::args_os().skip(1); // drop the executable's name

    let root = match cli.next() {
        Some(path) => path,
        None => return Err(io::Error::new(io::ErrorKind::Other, "missing path")),
    };

    // Everything after the path is an extension filter entry.
    Ok((root, cli.collect()))
}

/// Prints `hits` as a numbered list (1-based), or "(none)" when the
/// iterator is empty.
// The original declared an unused lifetime parameter `'a`; removed, as
// nothing in the signature referenced it (callers are unaffected).
fn print_hits<T>(hits: impl Iterator<Item = T>)
where
    T: Display,
{
    let mut shown = 0usize;
    for hit in hits {
        shown += 1;
        println!("  [{}] {}", shown, hit);
    }

    if shown == 0 {
        println!("(none)")
    }
}

/// Entry point: builds two indexes (file names and file contents) over a
/// directory tree, then runs an interactive query loop on stdin.
fn main() -> io::Result<()> {
    let (path, extensions) = args()?;
    // `files` indexes tokens of the file PATHS; `content` indexes tokens
    // of the file CONTENTS.
    let mut files = InvertedIndex::<PathBuf>::default();
    let mut content = InvertedIndex::<PathBuf>::default();

    println!(
        "Indexing {:?} files in '{}'",
        extensions,
        path.to_string_lossy()
    );

    // NOTE(review): files with NO extension are always skipped, even when
    // `extensions` is empty — `path.extension()` returns None for them and
    // `.filter(..).is_some()` then rejects the path.
    for path in Files::walk(path)?.map(|file| file.path()).filter(|path| {
        path.extension()
            .filter(|&ext| extensions.is_empty() || extensions.contains(ext))
            .is_some()
    }) {
        // Index the path itself by its tokens (for file-name search).
        files.add(path.clone(), tokenize(&path.to_string_lossy()));

        // Index the file's contents; unreadable files are reported and
        // skipped rather than aborting the whole indexing run.
        match tokenize_file(&path) {
            Ok(tokens) => content.add(path, tokens),
            Err(e) => eprintln!("Skipping a file {}: {}", path.display(), e),
        }
    }

    println!(
        "Indexed {} tokens in {} files.",
        content.tokens().count(),
        // NOTE(review): direct field access; `content.sources().len()`
        // would use the public accessor instead.
        content.sources.len()
    );

    // Run the query UI loop
    let mut query = String::new();

    loop {
        query.clear();
        println!("Enter search query:");
        // A read error or an empty line ends the session.
        if stdin().read_line(&mut query).is_err() || query.trim().is_empty() {
            break;
        }

        match query.trim() {
            // The "" arm is unreachable (empty input breaks above); the
            // slash commands end or inspect the session.
            "/exit" | "/quit" | "" => break,

            "/tokens" => {
                // Dump every distinct content token, sorted.
                println!("Tokens:");
                for token in content.tokens() {
                    println!("{}", token);
                }

                println!();
            }

            "/files" => {
                // Dump every indexed file path, in indexing order.
                println!("Sources:");
                for source in content.sources() {
                    println!("{}", source.display());
                }

                println!();
            }

            _ => {
                // Anything else is a search: AND all query tokens against
                // both the content index and the file-name index.
                let query = tokenize_query(&query);
                println!();
                println!("Found hits:");
                print_hits(content.search(query.iter()).map(|it| it.display()));
                println!("Found file names:");
                print_hits(files.search(query.iter()).map(|it| it.display()));
                println!();
            }
        }
    }

    Ok(())
}

{{out}}
<pre>C:\Temp>inverted_index books html htm txt
Indexing {"htm", "html", "txt"} files in 'books'
Indexed 34810 tokens in 9 files.
Enter search query:
war

Found hits:
[1] books\EN\C\Chaucer, Geoffrey - Canterbury Tales.txt
[2] books\EN\H\Homer - The Odyssey.txt
[3] books\EN\O\Orwell, George - 1984.html
[4] books\EN\W\Wells, Herbert George - The Invisible Man.txt
[5] books\EN\W\Wells, Herbert George - The Island of Doctor Moreau.txt
[6] books\EN\W\Wells, Herbert George - The Time Machine.txt
[7] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
[1] books\EN\W\Wells, Herbert George - War of the Worlds.txt

Enter search query:
war gun

Found hits:
[1] books\EN\O\Orwell, George - 1984.html
[2] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
(none)

Enter search query:
/exit

C:\Temp></pre>


=={{header|Scala}}==
=={{header|Scala}}==
Line 2,934: Line 3,209:
array set localidx {}
array set localidx {}
foreach word [wordsInString $data] {
foreach word [wordsInString $data] {
lappend localidx($word) $i
lappend localidx($word) $i
incr i
incr i
}
}


# Transcribe into global index
# Transcribe into global index
foreach {word places} [array get localidx] {
foreach {word places} [array get localidx] {
dict set index($word) $filename $places
dict set index($word) $filename $places
}
}
}
}
Line 2,948: Line 3,223:
global index
global index
if {[info exists index($word)]} {
if {[info exists index($word)]} {
return [dict keys $index($word)]
return [dict keys $index($word)]
}
}
}
}
Line 2,956: Line 3,231:
set files [findFilesForWord [lindex $words 0]]
set files [findFilesForWord [lindex $words 0]]
foreach w [lrange $words 1 end] {
foreach w [lrange $words 1 end] {
set wf [findFilesForWord $w]
set wf [findFilesForWord $w]
set newfiles {}
set newfiles {}
foreach f $files {
foreach f $files {
if {$f in $wf} {lappend newfiles $f}
if {$f in $wf} {lappend newfiles $f}
}
}
set files $newfiles
set files $newfiles
}
}
return $files
return $files
Line 2,971: Line 3,246:
set files {}
set files {}
foreach w $words {
foreach w $words {
if {![info exist index($w)]} {
if {![info exist index($w)]} {
return
return
}
}
}
}
dict for {file places} $index([lindex $words 0]) {
dict for {file places} $index([lindex $words 0]) {
if {$file in $files} continue
if {$file in $files} continue
foreach start $places {
foreach start $places {
set gotStart 1
set gotStart 1
foreach w [lrange $words 1 end] {
foreach w [lrange $words 1 end] {
incr start
incr start
set gotNext 0
set gotNext 0
foreach {f ps} $index($w) {
foreach {f ps} $index($w) {
if {$f ne $file} continue
if {$f ne $file} continue
foreach p $ps {
foreach p $ps {
if {$p == $start} {
if {$p == $start} {
set gotNext 1
set gotNext 1
break
break
}
}
}
}
if {$gotNext} break
if {$gotNext} break
}
}
if {!$gotNext} {
if {!$gotNext} {
set gotStart 0
set gotStart 0
break
break
}
}
}
}
if {$gotStart} {
if {$gotStart} {
lappend files $file
lappend files $file
break
break
}
}
}
}
}
}
return $files
return $files
Line 3,014: Line 3,289:
pack [entry .found.entry -textvariable terms] -fill x
pack [entry .found.entry -textvariable terms] -fill x
pack [button .found.findAll -command FindAll \
pack [button .found.findAll -command FindAll \
-text "Find File with All"] -side left
-text "Find File with All"] -side left
pack [button .found.findSeq -command FindSeq \
pack [button .found.findSeq -command FindSeq \
-text "Find File with Sequence"] -side right
-text "Find File with Sequence"] -side right


# The actions invoked by various GUI buttons
# The actions invoked by various GUI buttons
Line 3,023: Line 3,298:
set f [tk_getOpenFile]
set f [tk_getOpenFile]
if {$f ne ""} {
if {$f ne ""} {
addDocumentToIndex $f
addDocumentToIndex $f
lappend files $f
lappend files $f
}
}
}
}
Line 3,032: Line 3,307:
set fs [findFilesWithAllWords $words]
set fs [findFilesWithAllWords $words]
lappend found "Searching for files with all $terms" {*}$fs \
lappend found "Searching for files with all $terms" {*}$fs \
"---------------------"
"---------------------"
}
}
proc FindSeq {} {
proc FindSeq {} {
Line 3,039: Line 3,314:
set fs [findFilesWithWordSequence $words]
set fs [findFilesWithWordSequence $words]
lappend found "Searching for files with \"$terms\"" {*}$fs \
lappend found "Searching for files with \"$terms\"" {*}$fs \
"---------------------"
"---------------------"
}</lang>
}</lang>


Line 3,139: Line 3,414:
# the record separator for $INDEX/all.tab).
# the record separator for $INDEX/all.tab).
for file in "$@"; do
for file in "$@"; do
# Use printf(1), not echo, because "$file" might start with
# Use printf(1), not echo, because "$file" might start with
# a hyphen and become an option to echo.
# a hyphen and become an option to echo.
test 0 -eq $(printf %s "$file" | wc -l) || {
test 0 -eq $(printf %s "$file" | wc -l) || {
printf '%s\n' "$file: newline in filename" >&2
printf '%s\n' "$file: newline in filename" >&2
exit 1
exit 1
}
}
done
done


Line 3,153: Line 3,428:
fi=1
fi=1
for file in "$@"; do
for file in "$@"; do
printf %s "Indexing $file." >&2
printf %s "Indexing $file." >&2


# all.tab maps $fi => $file
# all.tab maps $fi => $file
echo "$fi $file" >> "$INDEX/all.tab"
echo "$fi $file" >> "$INDEX/all.tab"


# Use punctuation ([:punct:]) and whitespace (IFS)
# Use punctuation ([:punct:]) and whitespace (IFS)
# to split tokens.
# to split tokens.
ti=1
ti=1
tr -s '[:punct:]' ' ' < "$file" | while read line; do
tr -s '[:punct:]' ' ' < "$file" | while read line; do
for token in $line; do
for token in $line; do
# Index token by position ($fi, $ti). Ignore
# Index token by position ($fi, $ti). Ignore
# error from mkdir(1) if directory exists.
# error from mkdir(1) if directory exists.
mkdir "$INDEX/$token" 2>/dev/null
mkdir "$INDEX/$token" 2>/dev/null
echo $ti >> "$INDEX/$token/$fi"
echo $ti >> "$INDEX/$token/$fi"
: $((ti += 1))
: $((ti += 1))


# Show progress. Print a dot per 1000 tokens.
# Show progress. Print a dot per 1000 tokens.
case "$ti" in
case "$ti" in
*000) printf .
*000) printf .
esac
esac
done
done
done
done


echo >&2
echo >&2
: $((fi += 1))
: $((fi += 1))
done</lang>
done</lang>


Line 3,188: Line 3,463:
want=sequence
want=sequence
while getopts aos name; do
while getopts aos name; do
case "$name" in
case "$name" in
a) want=all;;
a) want=all;;
o) want=one;;
o) want=one;;
s) want=sequence;;
s) want=sequence;;
*) exit 2;;
*) exit 2;;
esac
esac
done
done
shift $((OPTIND - 1))
shift $((OPTIND - 1))


all() {
all() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}


one() {
one() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}


sequence() {
sequence() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}