Inverted index: Difference between revisions
Content added Content deleted
Thundergnat (talk | contribs) (Rename Perl 6 -> Raku, alphabetize, minor clean-up) |
(Add source for Rust) |
||
Line 5: | Line 5: | ||
;Task: |
;Task: |
||
Given a set of text files, implement a program to create an inverted index. |
Given a set of text files, implement a program to create an inverted index. |
||
Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms. |
Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms. |
||
The search index can be in memory. |
The search index can be in memory. |
||
<br><br> |
<br><br> |
||
Line 138: | Line 138: | ||
Enter one or more words to search for; <return> to finish: |
Enter one or more words to search for; <return> to finish: |
||
it |
it |
||
Found in the following files: 0.txt, 1.txt, 2.txt |
Found in the following files: 0.txt, 1.txt, 2.txt |
||
Line 545: | Line 545: | ||
Loop, %files%, 0,1 |
Loop, %files%, 0,1 |
||
{ |
{ |
||
tooltip,%A_index% / 500 |
tooltip,%A_index% / 500 |
||
wordList := WordsIn(A_LoopFileFullPath) |
wordList := WordsIn(A_LoopFileFullPath) |
||
InvertedIndex(wordList, A_loopFileFullpath) |
InvertedIndex(wordList, A_loopFileFullpath) |
||
} |
} |
||
Line 566: | Line 566: | ||
WordsIn(docpath) |
WordsIn(docpath) |
||
{ |
{ |
||
FileRead, content, %docpath% |
FileRead, content, %docpath% |
||
spos = 1 |
spos = 1 |
||
Line 576: | Line 576: | ||
this_wordList .= match "`n" |
this_wordList .= match "`n" |
||
} |
} |
||
Sort, this_wordList, U |
Sort, this_wordList, U |
||
return this_wordList |
return this_wordList |
||
} |
} |
||
Line 585: | Line 585: | ||
global word2docs |
global word2docs |
||
loop, parse, words, `n,`r |
loop, parse, words, `n,`r |
||
{ |
|||
{ |
|||
if A_loopField = |
if A_loopField = |
||
continue |
continue |
||
Line 609: | Line 609: | ||
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \ |
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \ |
||
\ "BBCKEY3.TXT", "BBCKEY4.TXT" |
\ "BBCKEY3.TXT", "BBCKEY4.TXT" |
||
DictSize% = 30000 |
DictSize% = 30000 |
||
DIM Index{(DictSize%-1) word$, link%} |
DIM Index{(DictSize%-1) word$, link%} |
||
REM Build inverted index: |
REM Build inverted index: |
||
FOR file% = DIM(FileList$(),1) TO 0 STEP -1 |
FOR file% = DIM(FileList$(),1) TO 0 STEP -1 |
||
Line 618: | Line 618: | ||
F% = OPENIN(filename$) |
F% = OPENIN(filename$) |
||
IF F% = 0 ERROR 100, "Failed to open file" |
IF F% = 0 ERROR 100, "Failed to open file" |
||
WHILE NOT EOF#F% |
WHILE NOT EOF#F% |
||
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%) |
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%) |
||
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65 |
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65 |
||
word$ = FNlower(LEFT$(word$)) |
word$ = FNlower(LEFT$(word$)) |
||
hash% = FNhash(word$) |
hash% = FNhash(word$) |
||
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$ |
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$ |
||
Line 636: | Line 636: | ||
ENDIF |
ENDIF |
||
ENDWHILE |
ENDWHILE |
||
CLOSE #F% |
CLOSE #F% |
||
NEXT file% |
NEXT file% |
||
REM Now query the index: |
REM Now query the index: |
||
PRINT FNquery("random") |
PRINT FNquery("random") |
||
Line 646: | Line 646: | ||
PRINT FNquery("the") |
PRINT FNquery("the") |
||
END |
END |
||
DEF FNquery(A$) |
DEF FNquery(A$) |
||
LOCAL hash%, link%, temp% |
LOCAL hash%, link%, temp% |
||
Line 663: | Line 663: | ||
ENDWHILE |
ENDWHILE |
||
= LEFT$(LEFT$(A$)) |
= LEFT$(LEFT$(A$)) |
||
DEF FNhash(A$) |
DEF FNhash(A$) |
||
LOCAL hash% |
LOCAL hash% |
||
Line 670: | Line 670: | ||
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4) |
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4) |
||
= hash% MOD DictSize% |
= hash% MOD DictSize% |
||
DEF FNlower(A$) |
DEF FNlower(A$) |
||
LOCAL A%,C% |
LOCAL A%,C% |
||
Line 694: | Line 694: | ||
int chr_idx[256] = {0}; |
int chr_idx[256] = {0}; |
||
char idx_chr[256] = {0}; |
char idx_chr[256] = {0}; |
||
#define FNAME 0 |
#define FNAME 0 |
||
typedef struct trie_t *trie, trie_t; |
typedef struct trie_t *trie, trie_t; |
||
struct trie_t { |
struct trie_t { |
||
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */ |
|||
int eow; |
|||
}; |
}; |
||
trie trie_new() { return calloc(sizeof(trie_t), 1); } |
trie trie_new() { return calloc(sizeof(trie_t), 1); } |
||
#define find_word(r, w) trie_trav(r, w, 1) |
#define find_word(r, w) trie_trav(r, w, 1) |
||
/* tree traversal: returns node if end of word and matches string, optionally |
/* tree traversal: returns node if end of word and matches string, optionally |
||
Line 710: | Line 710: | ||
trie trie_trav(trie root, const char * str, int no_create) |
trie trie_trav(trie root, const char * str, int no_create) |
||
{ |
{ |
||
int c; |
|||
while (root) { |
|||
if ((c = str[0]) == '\0') { |
|||
if (!root->eow && no_create) return 0; |
|||
break; |
|||
} |
|||
} |
|||
if (! (c = chr_idx[c]) ) { |
|||
str++; |
|||
continue; |
|||
} |
|||
} |
|||
if (!root->next[c]) { |
|||
if (no_create) return 0; |
|||
root->next[c] = trie_new(); |
|||
} |
|||
} |
|||
root = root->next[c]; |
|||
str++; |
|||
} |
|||
} |
|||
return root; |
|||
} |
} |
||
/* complete traversal of whole tree, calling callback at each end of word node. |
/* complete traversal of whole tree, calling callback at each end of word node. |
||
* similar method can be used to free nodes, had we wanted to do that. |
* similar method can be used to free nodes, had we wanted to do that. |
||
Line 736: | Line 736: | ||
int trie_all(trie root, char path[], int depth, int (*callback)(char *)) |
int trie_all(trie root, char path[], int depth, int (*callback)(char *)) |
||
{ |
{ |
||
int i; |
|||
if (root->eow && !callback(path)) return 0; |
|||
for (i = 1; i < sizeof(chr_legal); i++) { |
|||
if (!root->next[i]) continue; |
|||
path[depth] = idx_chr[i]; |
|||
path[depth + 1] = '\0'; |
|||
if (!trie_all(root->next[i], path, depth + 1, callback)) |
|||
return 0; |
|||
} |
|||
} |
|||
return 1; |
|||
} |
} |
||
void add_index(trie root, const char *word, const char *fname) |
void add_index(trie root, const char *word, const char *fname) |
||
{ |
{ |
||
trie x = trie_trav(root, word, 0); |
|||
x->eow = 1; |
|||
if (!x->next[FNAME]) |
|||
x->next[FNAME] = trie_new(); |
|||
x = trie_trav(x->next[FNAME], fname, 0); |
|||
x->eow = 1; |
|||
} |
} |
||
int print_path(char *path) |
int print_path(char *path) |
||
{ |
{ |
||
printf(" %s", path); |
|||
return 1; |
|||
} |
} |
||
/* pretend we parsed text files and got lower cased words: dealing * |
/* pretend we parsed text files and got lower cased words: dealing * |
||
* with text file is a whole other animal and would make code too long */ |
* with text file is a whole other animal and would make code too long */ |
||
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" }; |
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" }; |
||
const char *text[][5] ={{ "it", "is", "what", "it", "is" }, |
const char *text[][5] ={{ "it", "is", "what", "it", "is" }, |
||
{ "what", "is", "it", 0 }, |
|||
{ "it", "is", "a", "banana", 0 }}; |
|||
trie init_tables() |
trie init_tables() |
||
{ |
{ |
||
int i, j; |
|||
trie root = trie_new(); |
|||
for (i = 0; i < sizeof(chr_legal); i++) { |
|||
chr_idx[(int)chr_legal[i]] = i + 1; |
|||
idx_chr[i + 1] = chr_legal[i]; |
|||
} |
|||
} |
|||
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling. |
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling. |
||
* You need to have files named like above files[], with words in them |
* You need to have files named like above files[], with words in them |
||
* like in text[][]. Case doesn't matter (told you it's advanced). |
* like in text[][]. Case doesn't matter (told you it's advanced). |
||
*/ |
*/ |
||
#define USE_ADVANCED_FILE_HANDLING 0 |
#define USE_ADVANCED_FILE_HANDLING 0 |
||
#if USE_ADVANCED_FILE_HANDLING |
#if USE_ADVANCED_FILE_HANDLING |
||
void read_file(const char * fname) { |
|||
char cmd[1024]; |
|||
char word[1024]; |
|||
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname); |
|||
FILE *in = popen(cmd, "r"); |
|||
while (!feof(in)) { |
|||
fscanf(in, "%1000s", word); |
|||
add_index(root, word, fname); |
|||
} |
|||
} |
|||
pclose(in); |
|||
}; |
|||
read_file("f1.txt"); |
|||
read_file("source/f2.txt"); |
|||
read_file("other_file"); |
|||
#else |
#else |
||
for (i = 0; i < 3; i++) { |
|||
for (j = 0; j < 5; j++) { |
|||
if (!text[i][j]) break; |
|||
add_index(root, text[i][j], files[i]); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
#endif /*USE_ADVANCED_FILE_HANDLING*/ |
#endif /*USE_ADVANCED_FILE_HANDLING*/ |
||
return root; |
|||
} |
} |
||
void search_index(trie root, const char *word) |
void search_index(trie root, const char *word) |
||
{ |
{ |
||
char path[1024]; |
|||
printf("Search for \"%s\": ", word); |
|||
trie found = find_word(root, word); |
|||
if (!found) printf("not found\n"); |
|||
else { |
|||
trie_all(found->next[FNAME], path, 0, print_path); |
|||
printf("\n"); |
|||
} |
|||
} |
|||
} |
} |
||
int main() |
int main() |
||
{ |
{ |
||
trie root = init_tables(); |
|||
search_index(root, "what"); |
|||
search_index(root, "is"); |
|||
search_index(root, "banana"); |
|||
search_index(root, "boo"); |
|||
return 0; |
|||
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt |
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt |
||
Search for "is": f1.txt other_file source/f2.txt |
Search for "is": f1.txt other_file source/f2.txt |
||
Line 889: | Line 889: | ||
{ |
{ |
||
public: |
public: |
||
node() { clear(); } |
node() { clear(); } |
||
node( char z ) { clear(); } |
node( char z ) { clear(); } |
||
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; } |
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; } |
||
Line 933: | Line 933: | ||
const std::vector<std::string>& find( std::string s ) { |
const std::vector<std::string>& find( std::string s ) { |
||
size_t idx; |
size_t idx; |
||
std::transform( s.begin(), s.end(), s.begin(), tolower ); |
std::transform( s.begin(), s.end(), s.begin(), tolower ); |
||
node* rt = &root; |
node* rt = &root; |
||
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) { |
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) { |
||
idx = _CHARS.find( *i ); |
idx = _CHARS.find( *i ); |
||
if( idx < MAX_NODES ) { |
if( idx < MAX_NODES ) { |
||
if( !rt->next[idx] ) return std::vector<std::string>(); |
if( !rt->next[idx] ) return std::vector<std::string>(); |
||
rt = rt->next[idx]; |
rt = rt->next[idx]; |
||
} |
} |
||
} |
} |
||
if( rt->isWord ) return rt->files; |
if( rt->isWord ) return rt->files; |
||
return std::vector<std::string>(); |
return std::vector<std::string>(); |
||
Line 951: | Line 951: | ||
idx = _CHARS.find( *i ); |
idx = _CHARS.find( *i ); |
||
if( idx < MAX_NODES ) { |
if( idx < MAX_NODES ) { |
||
n = rt->next[idx]; |
n = rt->next[idx]; |
||
if( n ){ |
if( n ){ |
||
rt = n; |
rt = n; |
||
continue; |
continue; |
||
} |
} |
||
n = new node( *i ); |
n = new node( *i ); |
||
rt->next[idx] = n; |
rt->next[idx] = n; |
||
rt = n; |
rt = n; |
||
} |
} |
||
} |
} |
||
Line 983: | Line 983: | ||
} |
} |
||
} |
} |
||
while( true ) { |
while( true ) { |
||
std::cout << "Enter one word to search for, return to exit: "; |
std::cout << "Enter one word to search for, return to exit: "; |
||
Line 1,023: | Line 1,023: | ||
(defn term-seq [text] (map normalize (re-seq pattern text))) |
(defn term-seq [text] (map normalize (re-seq pattern text))) |
||
(defn set-assoc |
(defn set-assoc |
||
"Produces map with v added to the set associated with key k in map m" |
"Produces map with v added to the set associated with key k in map m" |
||
[m k v] (assoc m k (conj (get m k #{}) v))) |
[m k v] (assoc m k (conj (get m k #{}) v))) |
||
Line 1,064: | Line 1,064: | ||
console.log "#{fn}:#{line_num}" |
console.log "#{fn}:#{line_num}" |
||
console.log "\n" |
console.log "\n" |
||
get_words = (line) -> |
get_words = (line) -> |
||
words = line.replace(/\W/g, ' ').split ' ' |
words = line.replace(/\W/g, ' ').split ' ' |
||
Line 1,081: | Line 1,081: | ||
output |
output |
||
<lang> |
<lang> |
||
> coffee inverted_index.coffee |
> coffee inverted_index.coffee |
||
locations for 'make_index': |
locations for 'make_index': |
||
inverted_index.coffee:3 |
inverted_index.coffee:3 |
||
Line 1,211: | Line 1,211: | ||
;; get text for each file, and call (action filename text) |
;; get text for each file, and call (action filename text) |
||
(define (map-files action files) |
(define (map-files action files) |
||
(for ((file files)) |
|||
(file->string action file))) |
|||
;; create store |
;; create store |
||
(local-make-store INVERT) |
(local-make-store INVERT) |
||
; invert-word : word -> set of files |
; invert-word : word -> set of files |
||
(define (invert-word word file store) |
(define (invert-word word file store) |
||
(local-put-value word |
|||
(make-set (cons file (local-get-value word store))) store)) |
|||
; parse file text and invert each word |
; parse file text and invert each word |
||
(define (invert-text file text) |
(define (invert-text file text) |
||
(writeln 'Inverting file text) |
|||
(let ((text (text-parse text))) |
|||
(for ((word text)) (invert-word (string-downcase word) file INVERT)))) |
|||
</lang> |
</lang> |
||
Line 1,233: | Line 1,233: | ||
Intersect sets values of each word. |
Intersect sets values of each word. |
||
<lang lisp> |
<lang lisp> |
||
;; usage : (inverted-search w1 w2 ..) |
;; usage : (inverted-search w1 w2 ..) |
||
(define-syntax-rule (inverted-search w ...) |
(define-syntax-rule (inverted-search w ...) |
||
(and-get-invert (quote w ))) |
|||
;; intersects all sets referenced by words |
;; intersects all sets referenced by words |
||
;; returns the intersection |
;; returns the intersection |
||
(define (and-get-invert words) |
(define (and-get-invert words) |
||
(foldl |
|||
(lambda(word res) |
|||
(set-intersect res (local-get-value word INVERT))) |
|||
FILES words)) |
|||
</lang> |
</lang> |
||
Output : |
Output : |
||
Line 1,271: | Line 1,271: | ||
lists:foldl( fun import_from_file/2, dict:new(), Files ). |
lists:foldl( fun import_from_file/2, dict:new(), Files ). |
||
search( |
search( Binaries, Inverted_index ) -> |
||
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries], |
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries], |
||
lists:foldl( fun search_common/2, Files, T ). |
lists:foldl( fun search_common/2, Files, T ). |
||
Line 1,287: | Line 1,287: | ||
import_from_file( File, Dict_acc ) -> |
import_from_file( File, Dict_acc ) -> |
||
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ), |
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ), |
||
dict:merge( fun import_from_file_merge/3, Dict_acc, New_dict ). |
|||
import_from_file_contents( File, {ok, Binary} ) -> |
import_from_file_contents( File, {ok, Binary} ) -> |
||
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )]; |
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )]; |
||
import_from_file_contents( File, {error, Error} ) -> |
import_from_file_contents( File, {error, Error} ) -> |
||
io:fwrite( "Error: could not open file ~p: ~p~nContinuing with the rest of them~n", [File, Error] ), |
|||
[]. |
|||
import_from_file_merge( |
import_from_file_merge( _Key, Files, [New_file] ) -> [New_file | Files]. |
||
search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)]. |
search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)]. |
||
Line 1,306: | Line 1,306: | ||
// Map search terms to associated set of files |
// Map search terms to associated set of files |
||
type searchIndexMap = Map<string, Set<string>> |
type searchIndexMap = Map<string, Set<string>> |
||
let inputSearchCriteria() = |
let inputSearchCriteria() = |
||
let readLine prompt = |
let readLine prompt = |
||
printf "%s: " prompt |
printf "%s: " prompt |
||
Console.ReadLine().Split() |
Console.ReadLine().Split() |
||
readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower()) |
readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower()) |
||
let updateIndex indexMap keyValuePair = |
let updateIndex indexMap keyValuePair = |
||
let k, v = keyValuePair |
let k, v = keyValuePair |
||
match Map.tryFind k indexMap with |
match Map.tryFind k indexMap with |
||
| None -> Map.add k (Set.singleton v) indexMap |
| None -> Map.add k (Set.singleton v) indexMap |
||
| Some set -> Map.add k (Set.add v set) indexMap |
| Some set -> Map.add k (Set.add v set) indexMap |
||
let buildIndex files = |
let buildIndex files = |
||
let fileData file = |
let fileData file = |
||
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file) |
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file) |
||
files |> Seq.collect fileData |
files |> Seq.collect fileData |
||
|> Seq.fold updateIndex Map.empty |
|> Seq.fold updateIndex Map.empty |
||
let searchFiles() = |
let searchFiles() = |
||
let files, terms = inputSearchCriteria() |
let files, terms = inputSearchCriteria() |
||
Line 1,481: | Line 1,481: | ||
} |
} |
||
return true |
return true |
||
} |
} |
||
func ui() { |
func ui() { |
||
fmt.Println(len(index), "words indexed in", len(indexed), "files") |
fmt.Println(len(index), "words indexed in", len(indexed), "files") |
||
Line 1,500: | Line 1,500: | ||
fmt.Println("one match:") |
fmt.Println("one match:") |
||
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title) |
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title) |
||
default: |
default: |
||
fmt.Println(len(dl), "matches:") |
fmt.Println(len(dl), "matches:") |
||
for _, d := range dl { |
for _, d := range dl { |
||
Line 1,580: | Line 1,580: | ||
every textname := key(texts) do # build index for each 'text' |
every textname := key(texts) do # build index for each 'text' |
||
SII := InvertedIndex(SII,textname,texts[textname]) |
SII := InvertedIndex(SII,textname,texts[textname]) |
||
TermSearchUI(SII) # search UI |
TermSearchUI(SII) # search UI |
||
end |
end |
||
Line 1,601: | Line 1,601: | ||
repeat { |
repeat { |
||
writes("Enter search terms (^z to quit) : ") |
writes("Enter search terms (^z to quit) : ") |
||
terms := map(trim(read() | break)) |
terms := map(trim(read() | break)) |
||
x := [] |
x := [] |
||
terms ? while not pos(0) do { |
terms ? while not pos(0) do { |
||
tab(many(' \t')) |
tab(many(' \t')) |
||
put(x,tab(upto('\ \t')|0)) |
put(x,tab(upto('\ \t')|0)) |
||
} |
} |
||
show("Searching for : ",x) |
show("Searching for : ",x) |
||
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x) |
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x) |
||
} |
} |
||
write("End of search") |
write("End of search") |
||
Line 1,617: | Line 1,617: | ||
procedure TermSearch(ii,x) #: return set of matches or fail |
procedure TermSearch(ii,x) #: return set of matches or fail |
||
every s := !x do |
every s := !x do |
||
( /u := ii[s] ) | (u **:= ii[s]) |
( /u := ii[s] ) | (u **:= ii[s]) |
||
if *u > 0 then return u |
if *u > 0 then return u |
||
Line 1,625: | Line 1,625: | ||
every writes(s|!x) do writes(" ") |
every writes(s|!x) do writes(" ") |
||
write() |
write() |
||
return |
return |
||
end</lang> |
end</lang> |
||
Line 1,722: | Line 1,722: | ||
public class InvertedIndex { |
public class InvertedIndex { |
||
List<String> stopwords = Arrays.asList("a", "able", "about", |
|||
"across", "after", "all", "almost", "also", "am", "among", "an", |
|||
"and", "any", "are", "as", "at", "be", "because", "been", "but", |
|||
"by", "can", "cannot", "could", "dear", "did", "do", "does", |
|||
"either", "else", "ever", "every", "for", "from", "get", "got", |
|||
"had", "has", "have", "he", "her", "hers", "him", "his", "how", |
|||
"however", "i", "if", "in", "into", "is", "it", "its", "just", |
|||
"least", "let", "like", "likely", "may", "me", "might", "most", |
|||
"must", "my", "neither", "no", "nor", "not", "of", "off", "often", |
|||
"on", "only", "or", "other", "our", "own", "rather", "said", "say", |
|||
"says", "she", "should", "since", "so", "some", "than", "that", |
|||
"the", "their", "them", "then", "there", "these", "they", "this", |
|||
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were", |
|||
"what", "when", "where", "which", "while", "who", "whom", "why", |
|||
"will", "with", "would", "yet", "you", "your"); |
|||
Map<String, List<Tuple>> index = new HashMap<String, List<Tuple>>(); |
|||
List<String> files = new ArrayList<String>(); |
|||
public void indexFile(File file) throws IOException { |
|||
int fileno = files.indexOf(file.getPath()); |
|||
if (fileno == -1) { |
|||
files.add(file.getPath()); |
|||
fileno = files.size() - 1; |
|||
} |
|||
} |
|||
int pos = 0; |
|||
BufferedReader reader = new BufferedReader(new FileReader(file)); |
|||
for (String line = reader.readLine(); line != null; line = reader |
|||
.readLine()) { |
|||
for (String _word : line.split("\\W+")) { |
|||
String word = _word.toLowerCase(); |
|||
pos++; |
|||
if (stopwords.contains(word)) |
|||
continue; |
|||
List<Tuple> idx = index.get(word); |
|||
if (idx == null) { |
|||
idx = new LinkedList<Tuple>(); |
|||
index.put(word, idx); |
|||
} |
|||
} |
|||
idx.add(new Tuple(fileno, pos)); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
System.out.println("indexed " + file.getPath() + " " + pos + " words"); |
|||
} |
|||
} |
|||
public void search(List<String> words) { |
|||
for (String _word : words) { |
|||
Set<String> answer = new HashSet<String>(); |
|||
String word = _word.toLowerCase(); |
|||
List<Tuple> idx = index.get(word); |
|||
if (idx != null) { |
|||
for (Tuple t : idx) { |
|||
answer.add(files.get(t.fileno)); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
System.out.print(word); |
|||
for (String f : answer) { |
|||
System.out.print(" " + f); |
|||
} |
|||
} |
|||
System.out.println(""); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
public static void main(String[] args) { |
|||
try { |
|||
InvertedIndex idx = new InvertedIndex(); |
|||
for (int i = 1; i < args.length; i++) { |
|||
idx.indexFile(new File(args[i])); |
|||
} |
|||
} |
|||
idx.search(Arrays.asList(args[0].split(","))); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
private class Tuple { |
|||
private int fileno; |
|||
private int position; |
|||
public Tuple(int fileno, int position) { |
|||
this.fileno = fileno; |
|||
this.position = position; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} |
} |
||
Line 1,813: | Line 1,813: | ||
Example output: |
Example output: |
||
<lang Java> |
<lang Java> |
||
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt |
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt |
||
indexed pg30637.txt 106473 words |
indexed pg30637.txt 106473 words |
||
indexed pg7025.txt 205714 words |
indexed pg7025.txt 205714 words |
||
Line 1,848: | Line 1,848: | ||
def overlap(that): . as $this |
def overlap(that): . as $this |
||
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end); |
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end); |
||
. as $dict |
. as $dict |
||
| if (words|length) == 0 then [] |
| if (words|length) == 0 then [] |
||
Line 1,864: | Line 1,864: | ||
<lang jq>def prompt_search: |
<lang jq>def prompt_search: |
||
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:", |
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:", |
||
( (input | if type == "array" then . elif type == "string" then [.] |
( (input | if type == "array" then . elif type == "string" then [.] |
||
else empty |
else empty |
||
end) as $in |
end) as $in |
||
Line 1,997: | Line 1,997: | ||
Enter word(s) to be searched for in these files or 'q' to quit |
Enter word(s) to be searched for in these files or 'q' to quit |
||
? : cat |
? : cat |
||
'cat' not found |
'cat' not found |
||
Line 2,046: | Line 2,046: | ||
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \ |
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \ |
||
inv.ml |
inv.ml |
||
ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo |
ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo |
||
Line 2,143: | Line 2,143: | ||
foreach my $file (@files) |
foreach my $file (@files) |
||
{ |
{ |
||
open(F, "<", $file) or die "Can't read file $file: $!"; |
|||
while(<F>) { |
|||
s/\A\W+//; |
s/\A\W+//; |
||
foreach my $w (map {lc} grep {length() >= 3} split /\W+/) |
|||
{ |
|||
if ( exists($iindex{$w}) ) |
|||
{ |
|||
{ |
|||
$iindex{$w}->insert($file); |
|||
} else { |
|||
$iindex{$w} = set($file); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} |
|||
close(F); |
|||
} |
} |
||
return %iindex; |
return %iindex; |
||
Line 2,167: | Line 2,167: | ||
my @words = @_; |
my @words = @_; |
||
my $res = set(); |
my $res = set(); |
||
foreach my $w (map {lc} @words) |
foreach my $w (map {lc} @words) |
||
{ |
{ |
||
$w =~ s/\W+//g; # strip non-words chars |
|||
length $w < 3 and next; |
length $w < 3 and next; |
||
exists $idx{$w} or return set(); |
exists $idx{$w} or return set(); |
||
Line 2,189: | Line 2,189: | ||
=={{header|Phix}}== |
=={{header|Phix}}== |
||
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br> |
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br> |
||
Loads all text files in demo\rosetta\ and builds a list of filenames and |
Loads all text files in demo\rosetta\ and builds a list of filenames and |
||
a dictionary of {word,file_indexes}, before a handful of quick tests.<br> |
a dictionary of {word,file_indexes}, before a handful of quick tests.<br> |
||
Might be better (and almost as easy) for the dictionary values to be say |
Might be better (and almost as easy) for the dictionary values to be say |
||
Line 2,376: | Line 2,376: | ||
# Create index hashtable, as needed |
# Create index hashtable, as needed |
||
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} } |
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} } |
||
# For each file to be indexed... |
# For each file to be indexed... |
||
ForEach ( $File in $FileList ) |
ForEach ( $File in $FileList ) |
||
Line 2,382: | Line 2,382: | ||
# Find any previously existing entries for this file |
# Find any previously existing entries for this file |
||
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File } |
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File } |
||
# For any previously existing entries |
# For any previously existing entries |
||
# Delete them (prior to reindexing the file) |
# Delete them (prior to reindexing the file) |
||
Line 2,389: | Line 2,389: | ||
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } ) |
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } ) |
||
} |
} |
||
# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates |
# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates |
||
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique |
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique |
||
# For each word in the file... |
# For each word in the file... |
||
ForEach ( $Word in $Words ) |
ForEach ( $Word in $Words ) |
||
Line 2,410: | Line 2,410: | ||
} |
} |
||
} |
} |
||
function Find-Word ( [string]$Word ) |
function Find-Word ( [string]$Word ) |
||
{ |
{ |
||
Line 2,420: | Line 2,420: | ||
various words. |
various words. |
||
'@ | Out-File -FilePath C:\Test\File1.txt |
'@ | Out-File -FilePath C:\Test\File1.txt |
||
@' |
@' |
||
Create an index |
Create an index |
||
of words. |
of words. |
||
'@ | Out-File -FilePath C:\Test\File2.txt |
'@ | Out-File -FilePath C:\Test\File2.txt |
||
@' |
@' |
||
Use the index |
Use the index |
||
Line 2,628: | Line 2,628: | ||
{{works with|rakudo|2015-09-16}} |
{{works with|rakudo|2015-09-16}} |
||
<lang perl6>sub MAIN (*@files) { |
<lang perl6>sub MAIN (*@files) { |
||
my %norm; |
my %norm; |
||
do for @files -> $file { |
do for @files -> $file { |
||
%norm.push: $file X=> slurp($file).lc.words; |
%norm.push: $file X=> slurp($file).lc.words; |
||
Line 2,862: | Line 2,862: | ||
> ./indexsearch.rb It iS\! |
> ./indexsearch.rb It iS\! |
||
["file1", "file2", "file3"]</pre> |
["file1", "file2", "file3"]</pre> |
||
=={{header|Rust}}== |
|||
<lang Rust>// Part 1: Inverted index structure |
|||
use std::{ |
|||
borrow::Borrow, |
|||
collections::{BTreeMap, BTreeSet}, |
|||
}; |
|||
#[derive(Debug, Default)] |
|||
pub struct InvertedIndex<T> { |
|||
indexed: BTreeMap<String, BTreeSet<usize>>, |
|||
sources: Vec<T>, |
|||
} |
|||
impl<T> InvertedIndex<T> { |
|||
pub fn add<I, V>(&mut self, source: T, tokens: I) |
|||
where |
|||
I: IntoIterator<Item = V>, |
|||
V: Into<String>, |
|||
{ |
|||
let source_id = self.sources.len(); |
|||
self.sources.push(source); |
|||
for token in tokens { |
|||
self.indexed |
|||
.entry(token.into()) |
|||
.or_insert_with(BTreeSet::new) |
|||
.insert(source_id); |
|||
} |
|||
} |
|||
pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T> |
|||
where |
|||
String: Borrow<K>, |
|||
K: Ord + ?Sized + 'a, |
|||
I: IntoIterator<Item = &'a K>, |
|||
{ |
|||
let mut tokens = tokens.into_iter(); |
|||
tokens |
|||
.next() |
|||
.and_then(|token| self.indexed.get(token).cloned()) |
|||
.and_then(|first| { |
|||
tokens.try_fold(first, |found, token| { |
|||
self.indexed |
|||
.get(token) |
|||
.map(|sources| { |
|||
found |
|||
.intersection(sources) |
|||
.cloned() |
|||
.collect::<BTreeSet<_>>() |
|||
}) |
|||
.filter(|update| !update.is_empty()) |
|||
}) |
|||
}) |
|||
.unwrap_or_else(BTreeSet::new) |
|||
.into_iter() |
|||
.map(move |source| &self.sources[source]) |
|||
} |
|||
pub fn tokens(&self) -> impl Iterator<Item = &str> { |
|||
self.indexed.keys().map(|it| it.as_str()) |
|||
} |
|||
pub fn sources(&self) -> &[T] { |
|||
&self.sources |
|||
} |
|||
} |
|||
// Part 2: File walking and processing |
|||
use std::{ |
|||
ffi::OsString, |
|||
fmt::{Debug, Display}, |
|||
fs::{read_dir, DirEntry, File, ReadDir}, |
|||
io::{self, stdin, Read}, |
|||
path::{Path, PathBuf}, |
|||
}; |
|||
#[derive(Debug)] |
|||
pub struct Files { |
|||
dirs: Vec<ReadDir>, |
|||
} |
|||
impl Files { |
|||
pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> { |
|||
Ok(Files { |
|||
dirs: vec![read_dir(path)?], |
|||
}) |
|||
} |
|||
} |
|||
impl Iterator for Files { |
|||
type Item = DirEntry; |
|||
fn next(&mut self) -> Option<Self::Item> { |
|||
'outer: while let Some(mut current) = self.dirs.pop() { |
|||
while let Some(entry) = current.next() { |
|||
if let Ok(entry) = entry { |
|||
let path = entry.path(); |
|||
if !path.is_dir() { |
|||
self.dirs.push(current); |
|||
return Some(entry); |
|||
} else if let Ok(dir) = read_dir(path) { |
|||
self.dirs.push(current); |
|||
self.dirs.push(dir); |
|||
continue 'outer; |
|||
} |
|||
} |
|||
} |
|||
} |
|||
None // No directory left |
|||
} |
|||
} |
|||
fn tokenize<'a>(input: &'a str) -> impl Iterator<Item = String> + 'a { |
|||
input |
|||
.split(|c: char| !c.is_alphanumeric()) |
|||
.filter(|token| !token.is_empty()) |
|||
.map(|token| token.to_lowercase()) |
|||
} |
|||
fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> { |
|||
let mut buffer = Vec::new(); |
|||
File::open(path)?.read_to_end(&mut buffer)?; |
|||
let text = String::from_utf8_lossy(&buffer); |
|||
Ok(tokenize(&text).collect::<BTreeSet<_>>()) |
|||
} |
|||
fn tokenize_query(input: &str) -> Vec<String> { |
|||
let result = tokenize(input).collect::<BTreeSet<_>>(); |
|||
// Make a vector sorted by length, so that longer tokens are processed first. |
|||
// This heuristics should narrow the resulting set faster. |
|||
let mut result = result.into_iter().collect::<Vec<_>>(); |
|||
result.sort_by_key(|item| usize::MAX - item.len()); |
|||
result |
|||
} |
|||
// Part 3: Interactive application |
|||
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> { |
|||
let mut args = std::env::args_os().skip(1); // Skip the executable's name |
|||
let path = args |
|||
.next() |
|||
.ok_or_else(|| io::Error::new(io::ErrorKind::Other, "missing path"))?; |
|||
let extensions = args.collect::<BTreeSet<_>>(); |
|||
Ok((path, extensions)) |
|||
} |
|||
fn print_hits<'a, T>(hits: impl Iterator<Item = T>) |
|||
where |
|||
T: Display, |
|||
{ |
|||
let mut found_none = true; |
|||
for (number, hit) in hits.enumerate() { |
|||
println!(" [{}] {}", number + 1, hit); |
|||
found_none = false; |
|||
} |
|||
if found_none { |
|||
println!("(none)") |
|||
} |
|||
} |
|||
fn main() -> io::Result<()> { |
|||
let (path, extensions) = args()?; |
|||
let mut files = InvertedIndex::<PathBuf>::default(); |
|||
let mut content = InvertedIndex::<PathBuf>::default(); |
|||
println!( |
|||
"Indexing {:?} files in '{}'", |
|||
extensions, |
|||
path.to_string_lossy() |
|||
); |
|||
for path in Files::walk(path)?.map(|file| file.path()).filter(|path| { |
|||
path.extension() |
|||
.filter(|&ext| extensions.is_empty() || extensions.contains(ext)) |
|||
.is_some() |
|||
}) { |
|||
files.add(path.clone(), tokenize(&path.to_string_lossy())); |
|||
match tokenize_file(&path) { |
|||
Ok(tokens) => content.add(path, tokens), |
|||
Err(e) => eprintln!("Skipping a file {}: {}", path.display(), e), |
|||
} |
|||
} |
|||
println!( |
|||
"Indexed {} tokens in {} files.", |
|||
content.tokens().count(), |
|||
content.sources.len() |
|||
); |
|||
// Run the query UI loop |
|||
let mut query = String::new(); |
|||
loop { |
|||
query.clear(); |
|||
println!("Enter search query:"); |
|||
if stdin().read_line(&mut query).is_err() || query.trim().is_empty() { |
|||
break; |
|||
} |
|||
match query.trim() { |
|||
"/exit" | "/quit" | "" => break, |
|||
"/tokens" => { |
|||
println!("Tokens:"); |
|||
for token in content.tokens() { |
|||
println!("{}", token); |
|||
} |
|||
println!(); |
|||
} |
|||
"/files" => { |
|||
println!("Sources:"); |
|||
for source in content.sources() { |
|||
println!("{}", source.display()); |
|||
} |
|||
println!(); |
|||
} |
|||
_ => { |
|||
let query = tokenize_query(&query); |
|||
println!(); |
|||
println!("Found hits:"); |
|||
print_hits(content.search(query.iter()).map(|it| it.display())); |
|||
println!("Found file names:"); |
|||
print_hits(files.search(query.iter()).map(|it| it.display())); |
|||
println!(); |
|||
} |
|||
} |
|||
} |
|||
Ok(()) |
|||
}</lang> |
|||
{{out}} |
|||
<pre>C:\Temp>inverted_index books html htm txt |
|||
Indexing {"htm", "html", "txt"} files in 'books' |
|||
Indexed 34810 tokens in 9 files. |
|||
Enter search query: |
|||
war |
|||
Found hits: |
|||
[1] books\EN\C\Chaucer, Geoffrey - Canterbury Tales.txt |
|||
[2] books\EN\H\Homer - The Odyssey.txt |
|||
[3] books\EN\O\Orwell, George - 1984.html |
|||
[4] books\EN\W\Wells, Herbert George - The Invisible Man.txt |
|||
[5] books\EN\W\Wells, Herbert George - The Island of Doctor Moreau.txt |
|||
[6] books\EN\W\Wells, Herbert George - The Time Machine.txt |
|||
[7] books\EN\W\Wells, Herbert George - War of the Worlds.txt |
|||
Found file names: |
|||
[1] books\EN\W\Wells, Herbert George - War of the Worlds.txt |
|||
Enter search query: |
|||
war gun |
|||
Found hits: |
|||
[1] books\EN\O\Orwell, George - 1984.html |
|||
[2] books\EN\W\Wells, Herbert George - War of the Worlds.txt |
|||
Found file names: |
|||
(none) |
|||
Enter search query: |
|||
/exit |
|||
C:\Temp></pre> |
|||
=={{header|Scala}}== |
=={{header|Scala}}== |
||
Line 2,934: | Line 3,209: | ||
array set localidx {} |
array set localidx {} |
||
foreach word [wordsInString $data] { |
foreach word [wordsInString $data] { |
||
lappend localidx($word) $i |
|||
incr i |
|||
} |
} |
||
# Transcribe into global index |
# Transcribe into global index |
||
foreach {word places} [array get localidx] { |
foreach {word places} [array get localidx] { |
||
dict set index($word) $filename $places |
|||
} |
} |
||
} |
} |
||
Line 2,948: | Line 3,223: | ||
global index |
global index |
||
if {[info exists index($word)]} { |
if {[info exists index($word)]} { |
||
return [dict keys $index($word)] |
|||
} |
} |
||
} |
} |
||
Line 2,956: | Line 3,231: | ||
set files [findFilesForWord [lindex $words 0]] |
set files [findFilesForWord [lindex $words 0]] |
||
foreach w [lrange $words 1 end] { |
foreach w [lrange $words 1 end] { |
||
set wf [findFilesForWord $w] |
|||
set newfiles {} |
|||
foreach f $files { |
|||
if {$f in $wf} {lappend newfiles $f} |
|||
} |
|||
} |
|||
set files $newfiles |
|||
} |
} |
||
return $files |
return $files |
||
Line 2,971: | Line 3,246: | ||
set files {} |
set files {} |
||
foreach w $words { |
foreach w $words { |
||
if {![info exist index($w)]} { |
|||
return |
|||
} |
|||
} |
|||
} |
} |
||
dict for {file places} $index([lindex $words 0]) { |
dict for {file places} $index([lindex $words 0]) { |
||
if {$file in $files} continue |
|||
foreach start $places { |
|||
set gotStart 1 |
|||
foreach w [lrange $words 1 end] { |
|||
incr start |
|||
set gotNext 0 |
|||
foreach {f ps} $index($w) { |
|||
if {$f ne $file} continue |
|||
foreach p $ps { |
|||
if {$p == $start} { |
|||
set gotNext 1 |
|||
break |
|||
} |
|||
} |
|||
} |
|||
if {$gotNext} break |
|||
} |
|||
} |
|||
if {!$gotNext} { |
|||
set gotStart 0 |
|||
break |
|||
} |
|||
} |
|||
} |
|||
if {$gotStart} { |
|||
lappend files $file |
|||
break |
|||
} |
|||
} |
|||
} |
|||
} |
} |
||
return $files |
return $files |
||
Line 3,014: | Line 3,289: | ||
pack [entry .found.entry -textvariable terms] -fill x |
pack [entry .found.entry -textvariable terms] -fill x |
||
pack [button .found.findAll -command FindAll \ |
pack [button .found.findAll -command FindAll \ |
||
-text "Find File with All"] -side left |
|||
pack [button .found.findSeq -command FindSeq \ |
pack [button .found.findSeq -command FindSeq \ |
||
-text "Find File with Sequence"] -side right |
|||
# The actions invoked by various GUI buttons |
# The actions invoked by various GUI buttons |
||
Line 3,023: | Line 3,298: | ||
set f [tk_getOpenFile] |
set f [tk_getOpenFile] |
||
if {$f ne ""} { |
if {$f ne ""} { |
||
addDocumentToIndex $f |
|||
lappend files $f |
|||
} |
} |
||
} |
} |
||
Line 3,032: | Line 3,307: | ||
set fs [findFilesWithAllWords $words] |
set fs [findFilesWithAllWords $words] |
||
lappend found "Searching for files with all $terms" {*}$fs \ |
lappend found "Searching for files with all $terms" {*}$fs \ |
||
"---------------------" |
|||
} |
} |
||
proc FindSeq {} { |
proc FindSeq {} { |
||
Line 3,039: | Line 3,314: | ||
set fs [findFilesWithWordSequence $words] |
set fs [findFilesWithWordSequence $words] |
||
lappend found "Searching for files with \"$terms\"" {*}$fs \ |
lappend found "Searching for files with \"$terms\"" {*}$fs \ |
||
"---------------------" |
|||
}</lang> |
}</lang> |
||
Line 3,139: | Line 3,414: | ||
# the record separator for $INDEX/all.tab). |
# the record separator for $INDEX/all.tab). |
||
for file in "$@"; do |
for file in "$@"; do |
||
# Use printf(1), not echo, because "$file" might start with |
|||
# a hyphen and become an option to echo. |
|||
test 0 -eq $(printf %s "$file" | wc -l) || { |
|||
printf '%s\n' "$file: newline in filename" >&2 |
|||
exit 1 |
|||
} |
|||
} |
|||
done |
done |
||
Line 3,153: | Line 3,428: | ||
fi=1 |
fi=1 |
||
for file in "$@"; do |
for file in "$@"; do |
||
printf %s "Indexing $file." >&2 |
|||
# all.tab maps $fi => $file |
|||
echo "$fi $file" >> "$INDEX/all.tab" |
|||
# Use punctuation ([:punct:]) and whitespace (IFS) |
|||
# to split tokens. |
|||
ti=1 |
|||
tr -s '[:punct:]' ' ' < "$file" | while read line; do |
|||
for token in $line; do |
|||
# Index token by position ($fi, $ti). Ignore |
|||
# error from mkdir(1) if directory exists. |
|||
mkdir "$INDEX/$token" 2>/dev/null |
|||
echo $ti >> "$INDEX/$token/$fi" |
|||
: $((ti += 1)) |
|||
# Show progress. Print a dot per 1000 tokens. |
|||
case "$ti" in |
|||
*000) printf . |
|||
esac |
|||
done |
|||
done |
|||
echo >&2 |
|||
: $((fi += 1)) |
|||
done</lang> |
done</lang> |
||
Line 3,188: | Line 3,463: | ||
want=sequence |
want=sequence |
||
while getopts aos name; do |
while getopts aos name; do |
||
case "$name" in |
|||
a) want=all;; |
|||
o) want=one;; |
|||
s) want=sequence;; |
|||
*) exit 2;; |
|||
esac |
|||
done |
done |
||
shift $((OPTIND - 1)) |
shift $((OPTIND - 1)) |
||
all() { |
all() { |
||
echo "TODO" |
|||
exit 2 |
|||
} |
} |
||
one() { |
one() { |
||
echo "TODO" |
|||
exit 2 |
|||
} |
} |
||
sequence() { |
sequence() { |
||
echo "TODO" |
|||
exit 2 |
|||
} |
} |
||