Inverted index: Difference between revisions

Content added Content deleted
(Rename Perl 6 -> Raku, alphabetize, minor clean-up)
(Add source for Rust)
Line 5: Line 5:


;Task:
;Task:
Given a set of text files, implement a program to create an inverted index.
Given a set of text files, implement a program to create an inverted index.


Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms.
Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms.


The search index can be in memory.
The search index can be in memory.
<br><br>
<br><br>


Line 138: Line 138:


Enter one or more words to search for; <return> to finish:
Enter one or more words to search for; <return> to finish:
it
it
Found in the following files: 0.txt, 1.txt, 2.txt
Found in the following files: 0.txt, 1.txt, 2.txt


Line 545: Line 545:
Loop, %files%, 0,1
Loop, %files%, 0,1
{
{
tooltip,%A_index% / 500
tooltip,%A_index% / 500

wordList := WordsIn(A_LoopFileFullPath)
wordList := WordsIn(A_LoopFileFullPath)
InvertedIndex(wordList, A_loopFileFullpath)
InvertedIndex(wordList, A_loopFileFullpath)
}
}


Line 566: Line 566:


WordsIn(docpath)
WordsIn(docpath)
{
{
FileRead, content, %docpath%
FileRead, content, %docpath%
spos = 1
spos = 1
Line 576: Line 576:
this_wordList .= match "`n"
this_wordList .= match "`n"
}
}

Sort, this_wordList, U
Sort, this_wordList, U
return this_wordList
return this_wordList
}
}


Line 585: Line 585:
global word2docs
global word2docs


loop, parse, words, `n,`r
loop, parse, words, `n,`r
{
{
if A_loopField =
if A_loopField =
continue
continue
Line 609: Line 609:
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \
\ "BBCKEY3.TXT", "BBCKEY4.TXT"
\ "BBCKEY3.TXT", "BBCKEY4.TXT"

DictSize% = 30000
DictSize% = 30000
DIM Index{(DictSize%-1) word$, link%}
DIM Index{(DictSize%-1) word$, link%}

REM Build inverted index:
REM Build inverted index:
FOR file% = DIM(FileList$(),1) TO 0 STEP -1
FOR file% = DIM(FileList$(),1) TO 0 STEP -1
Line 618: Line 618:
F% = OPENIN(filename$)
F% = OPENIN(filename$)
IF F% = 0 ERROR 100, "Failed to open file"
IF F% = 0 ERROR 100, "Failed to open file"

WHILE NOT EOF#F%
WHILE NOT EOF#F%
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%)
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%)
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65
word$ = FNlower(LEFT$(word$))
word$ = FNlower(LEFT$(word$))

hash% = FNhash(word$)
hash% = FNhash(word$)
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$
Line 636: Line 636:
ENDIF
ENDIF
ENDWHILE
ENDWHILE

CLOSE #F%
CLOSE #F%
NEXT file%
NEXT file%

REM Now query the index:
REM Now query the index:
PRINT FNquery("random")
PRINT FNquery("random")
Line 646: Line 646:
PRINT FNquery("the")
PRINT FNquery("the")
END
END

DEF FNquery(A$)
DEF FNquery(A$)
LOCAL hash%, link%, temp%
LOCAL hash%, link%, temp%
Line 663: Line 663:
ENDWHILE
ENDWHILE
= LEFT$(LEFT$(A$))
= LEFT$(LEFT$(A$))

DEF FNhash(A$)
DEF FNhash(A$)
LOCAL hash%
LOCAL hash%
Line 670: Line 670:
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4)
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4)
= hash% MOD DictSize%
= hash% MOD DictSize%

DEF FNlower(A$)
DEF FNlower(A$)
LOCAL A%,C%
LOCAL A%,C%
Line 694: Line 694:
int chr_idx[256] = {0};
int chr_idx[256] = {0};
char idx_chr[256] = {0};
char idx_chr[256] = {0};

#define FNAME 0
#define FNAME 0
typedef struct trie_t *trie, trie_t;
typedef struct trie_t *trie, trie_t;
struct trie_t {
struct trie_t {
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */
trie next[sizeof(chr_legal)]; /* next letter; slot 0 is for file name */
int eow;
int eow;
};
};

trie trie_new() { return calloc(sizeof(trie_t), 1); }
trie trie_new() { return calloc(sizeof(trie_t), 1); }

#define find_word(r, w) trie_trav(r, w, 1)
#define find_word(r, w) trie_trav(r, w, 1)
/* tree traversal: returns node if end of word and matches string, optionally
/* tree traversal: returns node if end of word and matches string, optionally
Line 710: Line 710:
trie trie_trav(trie root, const char * str, int no_create)
trie trie_trav(trie root, const char * str, int no_create)
{
{
int c;
int c;
while (root) {
while (root) {
if ((c = str[0]) == '\0') {
if ((c = str[0]) == '\0') {
if (!root->eow && no_create) return 0;
if (!root->eow && no_create) return 0;
break;
break;
}
}
if (! (c = chr_idx[c]) ) {
if (! (c = chr_idx[c]) ) {
str++;
str++;
continue;
continue;
}
}

if (!root->next[c]) {
if (!root->next[c]) {
if (no_create) return 0;
if (no_create) return 0;
root->next[c] = trie_new();
root->next[c] = trie_new();
}
}
root = root->next[c];
root = root->next[c];
str++;
str++;
}
}
return root;
return root;
}
}

/* complete traversal of whole tree, calling callback at each end of word node.
/* complete traversal of whole tree, calling callback at each end of word node.
* similar method can be used to free nodes, had we wanted to do that.
* similar method can be used to free nodes, had we wanted to do that.
Line 736: Line 736:
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
{
{
int i;
int i;
if (root->eow && !callback(path)) return 0;
if (root->eow && !callback(path)) return 0;

for (i = 1; i < sizeof(chr_legal); i++) {
for (i = 1; i < sizeof(chr_legal); i++) {
if (!root->next[i]) continue;
if (!root->next[i]) continue;

path[depth] = idx_chr[i];
path[depth] = idx_chr[i];
path[depth + 1] = '\0';
path[depth + 1] = '\0';
if (!trie_all(root->next[i], path, depth + 1, callback))
if (!trie_all(root->next[i], path, depth + 1, callback))
return 0;
return 0;
}
}
return 1;
return 1;
}
}

void add_index(trie root, const char *word, const char *fname)
void add_index(trie root, const char *word, const char *fname)
{
{
trie x = trie_trav(root, word, 0);
trie x = trie_trav(root, word, 0);
x->eow = 1;
x->eow = 1;

if (!x->next[FNAME])
if (!x->next[FNAME])
x->next[FNAME] = trie_new();
x->next[FNAME] = trie_new();
x = trie_trav(x->next[FNAME], fname, 0);
x = trie_trav(x->next[FNAME], fname, 0);
x->eow = 1;
x->eow = 1;
}
}

int print_path(char *path)
int print_path(char *path)
{
{
printf(" %s", path);
printf(" %s", path);
return 1;
return 1;
}
}

/* pretend we parsed text files and got lower cased words: dealing *
/* pretend we parsed text files and got lower cased words: dealing *
* with text file is a whole other animal and would make code too long */
* with text file is a whole other animal and would make code too long */
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
const char *text[][5] ={{ "it", "is", "what", "it", "is" },
const char *text[][5] ={{ "it", "is", "what", "it", "is" },
{ "what", "is", "it", 0 },
{ "what", "is", "it", 0 },
{ "it", "is", "a", "banana", 0 }};
{ "it", "is", "a", "banana", 0 }};

trie init_tables()
trie init_tables()
{
{
int i, j;
int i, j;
trie root = trie_new();
trie root = trie_new();
for (i = 0; i < sizeof(chr_legal); i++) {
for (i = 0; i < sizeof(chr_legal); i++) {
chr_idx[(int)chr_legal[i]] = i + 1;
chr_idx[(int)chr_legal[i]] = i + 1;
idx_chr[i + 1] = chr_legal[i];
idx_chr[i + 1] = chr_legal[i];
}
}


/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
* You need to have files named like above files[], with words in them
* You need to have files named like above files[], with words in them
* like in text[][]. Case doesn't matter (told you it's advanced).
* like in text[][]. Case doesn't matter (told you it's advanced).
*/
*/
#define USE_ADVANCED_FILE_HANDLING 0
#define USE_ADVANCED_FILE_HANDLING 0
#if USE_ADVANCED_FILE_HANDLING
#if USE_ADVANCED_FILE_HANDLING
void read_file(const char * fname) {
void read_file(const char * fname) {
char cmd[1024];
char cmd[1024];
char word[1024];
char word[1024];
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname);
sprintf(cmd, "perl -p -e 'while(/(\\w+)/g) {print lc($1),\"\\n\"}' %s", fname);
FILE *in = popen(cmd, "r");
FILE *in = popen(cmd, "r");
while (!feof(in)) {
while (!feof(in)) {
fscanf(in, "%1000s", word);
fscanf(in, "%1000s", word);
add_index(root, word, fname);
add_index(root, word, fname);
}
}
pclose(in);
pclose(in);
};
};


read_file("f1.txt");
read_file("f1.txt");
read_file("source/f2.txt");
read_file("source/f2.txt");
read_file("other_file");
read_file("other_file");
#else
#else
for (i = 0; i < 3; i++) {
for (i = 0; i < 3; i++) {
for (j = 0; j < 5; j++) {
for (j = 0; j < 5; j++) {
if (!text[i][j]) break;
if (!text[i][j]) break;
add_index(root, text[i][j], files[i]);
add_index(root, text[i][j], files[i]);
}
}
}
}
#endif /*USE_ADVANCED_FILE_HANDLING*/
#endif /*USE_ADVANCED_FILE_HANDLING*/


return root;
return root;
}
}

void search_index(trie root, const char *word)
void search_index(trie root, const char *word)
{
{
char path[1024];
char path[1024];
printf("Search for \"%s\": ", word);
printf("Search for \"%s\": ", word);
trie found = find_word(root, word);
trie found = find_word(root, word);

if (!found) printf("not found\n");
if (!found) printf("not found\n");
else {
else {
trie_all(found->next[FNAME], path, 0, print_path);
trie_all(found->next[FNAME], path, 0, print_path);
printf("\n");
printf("\n");
}
}
}
}

int main()
int main()
{
{
trie root = init_tables();
trie root = init_tables();

search_index(root, "what");
search_index(root, "what");
search_index(root, "is");
search_index(root, "is");
search_index(root, "banana");
search_index(root, "banana");
search_index(root, "boo");
search_index(root, "boo");
return 0;
return 0;
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt
Search for "is": f1.txt other_file source/f2.txt
Search for "is": f1.txt other_file source/f2.txt
Line 889: Line 889:
{
{
public:
public:
node() { clear(); }
node() { clear(); }
node( char z ) { clear(); }
node( char z ) { clear(); }
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; }
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; }
Line 933: Line 933:
const std::vector<std::string>& find( std::string s ) {
const std::vector<std::string>& find( std::string s ) {
size_t idx;
size_t idx;
std::transform( s.begin(), s.end(), s.begin(), tolower );
std::transform( s.begin(), s.end(), s.begin(), tolower );
node* rt = &root;
node* rt = &root;
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) {
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) {
idx = _CHARS.find( *i );
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
if( idx < MAX_NODES ) {
if( !rt->next[idx] ) return std::vector<std::string>();
if( !rt->next[idx] ) return std::vector<std::string>();
rt = rt->next[idx];
rt = rt->next[idx];
}
}
}
}
if( rt->isWord ) return rt->files;
if( rt->isWord ) return rt->files;
return std::vector<std::string>();
return std::vector<std::string>();
Line 951: Line 951:
idx = _CHARS.find( *i );
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
if( idx < MAX_NODES ) {
n = rt->next[idx];
n = rt->next[idx];
if( n ){
if( n ){
rt = n;
rt = n;
continue;
continue;
}
}
n = new node( *i );
n = new node( *i );
rt->next[idx] = n;
rt->next[idx] = n;
rt = n;
rt = n;
}
}
}
}
Line 983: Line 983:
}
}
}
}

while( true ) {
while( true ) {
std::cout << "Enter one word to search for, return to exit: ";
std::cout << "Enter one word to search for, return to exit: ";
Line 1,023: Line 1,023:
(defn term-seq [text] (map normalize (re-seq pattern text)))
(defn term-seq [text] (map normalize (re-seq pattern text)))


(defn set-assoc
(defn set-assoc
"Produces map with v added to the set associated with key k in map m"
"Produces map with v added to the set associated with key k in map m"
[m k v] (assoc m k (conj (get m k #{}) v)))
[m k v] (assoc m k (conj (get m k #{}) v)))
Line 1,064: Line 1,064:
console.log "#{fn}:#{line_num}"
console.log "#{fn}:#{line_num}"
console.log "\n"
console.log "\n"

get_words = (line) ->
get_words = (line) ->
words = line.replace(/\W/g, ' ').split ' '
words = line.replace(/\W/g, ' ').split ' '
Line 1,081: Line 1,081:
output
output
<lang>
<lang>
> coffee inverted_index.coffee
> coffee inverted_index.coffee
locations for 'make_index':
locations for 'make_index':
inverted_index.coffee:3
inverted_index.coffee:3
Line 1,211: Line 1,211:


;; get text for each file, and call (action filename text)
;; get text for each file, and call (action filename text)
(define (map-files action files)
(define (map-files action files)
(for ((file files))
(for ((file files))
(file->string action file)))
(file->string action file)))

;; create store
;; create store
(local-make-store INVERT)
(local-make-store INVERT)


; invert-word : word -> set of files
; invert-word : word -> set of files
(define (invert-word word file store)
(define (invert-word word file store)
(local-put-value word
(local-put-value word
(make-set (cons file (local-get-value word store))) store))
(make-set (cons file (local-get-value word store))) store))

; parse file text and invert each word
; parse file text and invert each word
(define (invert-text file text)
(define (invert-text file text)
(writeln 'Inverting file text)
(writeln 'Inverting file text)
(let ((text (text-parse text)))
(let ((text (text-parse text)))
(for ((word text)) (invert-word (string-downcase word) file INVERT))))
(for ((word text)) (invert-word (string-downcase word) file INVERT))))
</lang>
</lang>


Line 1,233: Line 1,233:
Intersect sets values of each word.
Intersect sets values of each word.
<lang lisp>
<lang lisp>
;; usage : (inverted-search w1 w2 ..)
;; usage : (inverted-search w1 w2 ..)
(define-syntax-rule (inverted-search w ...)
(define-syntax-rule (inverted-search w ...)
(and-get-invert (quote w )))
(and-get-invert (quote w )))


;; intersects all sets referenced by words
;; intersects all sets referenced by words
;; returns the intersection
;; returns the intersection
(define (and-get-invert words)
(define (and-get-invert words)
(foldl
(foldl
(lambda(word res)
(lambda(word res)
(set-intersect res (local-get-value word INVERT)))
(set-intersect res (local-get-value word INVERT)))
FILES words))
FILES words))
</lang>
</lang>
Output :
Output :
Line 1,271: Line 1,271:
lists:foldl( fun import_from_file/2, dict:new(), Files ).
lists:foldl( fun import_from_file/2, dict:new(), Files ).


search( Binaries, Inverted_index ) ->
search( Binaries, Inverted_index ) ->
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries],
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries],
lists:foldl( fun search_common/2, Files, T ).
lists:foldl( fun search_common/2, Files, T ).
Line 1,287: Line 1,287:
import_from_file( File, Dict_acc ) ->
import_from_file( File, Dict_acc ) ->
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ),
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ),
dict:merge( fun import_from_file_merge/3, Dict_acc, New_dict ).
dict:merge( fun import_from_file_merge/3, Dict_acc, New_dict ).


import_from_file_contents( File, {ok, Binary} ) ->
import_from_file_contents( File, {ok, Binary} ) ->
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )];
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )];
import_from_file_contents( File, {error, Error} ) ->
import_from_file_contents( File, {error, Error} ) ->
io:fwrite( "Error: could not open file ~p: ~p~nContinuing with the rest of them~n", [File, Error] ),
io:fwrite( "Error: could not open file ~p: ~p~nContinuing with the rest of them~n", [File, Error] ),
[].
[].


import_from_file_merge( _Key, Files, [New_file] ) -> [New_file | Files].
import_from_file_merge( _Key, Files, [New_file] ) -> [New_file | Files].


search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)].
search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)].
Line 1,306: Line 1,306:
// Map search terms to associated set of files
// Map search terms to associated set of files
type searchIndexMap = Map<string, Set<string>>
type searchIndexMap = Map<string, Set<string>>

let inputSearchCriteria() =
let inputSearchCriteria() =
let readLine prompt =
let readLine prompt =
printf "%s: " prompt
printf "%s: " prompt
Console.ReadLine().Split()
Console.ReadLine().Split()

readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())
readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())

let updateIndex indexMap keyValuePair =
let updateIndex indexMap keyValuePair =
let k, v = keyValuePair
let k, v = keyValuePair

match Map.tryFind k indexMap with
match Map.tryFind k indexMap with
| None -> Map.add k (Set.singleton v) indexMap
| None -> Map.add k (Set.singleton v) indexMap
| Some set -> Map.add k (Set.add v set) indexMap
| Some set -> Map.add k (Set.add v set) indexMap

let buildIndex files =
let buildIndex files =
let fileData file =
let fileData file =
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file)
File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file)

files |> Seq.collect fileData
files |> Seq.collect fileData
|> Seq.fold updateIndex Map.empty
|> Seq.fold updateIndex Map.empty

let searchFiles() =
let searchFiles() =
let files, terms = inputSearchCriteria()
let files, terms = inputSearchCriteria()
Line 1,481: Line 1,481:
}
}
return true
return true
}
}

func ui() {
func ui() {
fmt.Println(len(index), "words indexed in", len(indexed), "files")
fmt.Println(len(index), "words indexed in", len(indexed), "files")
Line 1,500: Line 1,500:
fmt.Println("one match:")
fmt.Println("one match:")
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
default:
default:
fmt.Println(len(dl), "matches:")
fmt.Println(len(dl), "matches:")
for _, d := range dl {
for _, d := range dl {
Line 1,580: Line 1,580:


every textname := key(texts) do # build index for each 'text'
every textname := key(texts) do # build index for each 'text'
SII := InvertedIndex(SII,textname,texts[textname])
SII := InvertedIndex(SII,textname,texts[textname])

TermSearchUI(SII) # search UI
TermSearchUI(SII) # search UI

end
end


Line 1,601: Line 1,601:
repeat {
repeat {
writes("Enter search terms (^z to quit) : ")
writes("Enter search terms (^z to quit) : ")
terms := map(trim(read() | break))
terms := map(trim(read() | break))


x := []
x := []
terms ? while not pos(0) do {
terms ? while not pos(0) do {
tab(many(' \t'))
tab(many(' \t'))
put(x,tab(upto('\ \t')|0))
put(x,tab(upto('\ \t')|0))
}
}

show("Searching for : ",x)
show("Searching for : ",x)
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x)
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x)
}
}
write("End of search")
write("End of search")
Line 1,617: Line 1,617:


procedure TermSearch(ii,x) #: return set of matches or fail
procedure TermSearch(ii,x) #: return set of matches or fail
every s := !x do
every s := !x do
( /u := ii[s] ) | (u **:= ii[s])
( /u := ii[s] ) | (u **:= ii[s])
if *u > 0 then return u
if *u > 0 then return u
Line 1,625: Line 1,625:
every writes(s|!x) do writes(" ")
every writes(s|!x) do writes(" ")
write()
write()
return
return
end</lang>
end</lang>


Line 1,722: Line 1,722:
public class InvertedIndex {
public class InvertedIndex {


List<String> stopwords = Arrays.asList("a", "able", "about",
List<String> stopwords = Arrays.asList("a", "able", "about",
"across", "after", "all", "almost", "also", "am", "among", "an",
"across", "after", "all", "almost", "also", "am", "among", "an",
"and", "any", "are", "as", "at", "be", "because", "been", "but",
"and", "any", "are", "as", "at", "be", "because", "been", "but",
"by", "can", "cannot", "could", "dear", "did", "do", "does",
"by", "can", "cannot", "could", "dear", "did", "do", "does",
"either", "else", "ever", "every", "for", "from", "get", "got",
"either", "else", "ever", "every", "for", "from", "get", "got",
"had", "has", "have", "he", "her", "hers", "him", "his", "how",
"had", "has", "have", "he", "her", "hers", "him", "his", "how",
"however", "i", "if", "in", "into", "is", "it", "its", "just",
"however", "i", "if", "in", "into", "is", "it", "its", "just",
"least", "let", "like", "likely", "may", "me", "might", "most",
"least", "let", "like", "likely", "may", "me", "might", "most",
"must", "my", "neither", "no", "nor", "not", "of", "off", "often",
"must", "my", "neither", "no", "nor", "not", "of", "off", "often",
"on", "only", "or", "other", "our", "own", "rather", "said", "say",
"on", "only", "or", "other", "our", "own", "rather", "said", "say",
"says", "she", "should", "since", "so", "some", "than", "that",
"says", "she", "should", "since", "so", "some", "than", "that",
"the", "their", "them", "then", "there", "these", "they", "this",
"the", "their", "them", "then", "there", "these", "they", "this",
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were",
"tis", "to", "too", "twas", "us", "wants", "was", "we", "were",
"what", "when", "where", "which", "while", "who", "whom", "why",
"what", "when", "where", "which", "while", "who", "whom", "why",
"will", "with", "would", "yet", "you", "your");
"will", "with", "would", "yet", "you", "your");


Map<String, List<Tuple>> index = new HashMap<String, List<Tuple>>();
Map<String, List<Tuple>> index = new HashMap<String, List<Tuple>>();
List<String> files = new ArrayList<String>();
List<String> files = new ArrayList<String>();


public void indexFile(File file) throws IOException {
public void indexFile(File file) throws IOException {
int fileno = files.indexOf(file.getPath());
int fileno = files.indexOf(file.getPath());
if (fileno == -1) {
if (fileno == -1) {
files.add(file.getPath());
files.add(file.getPath());
fileno = files.size() - 1;
fileno = files.size() - 1;
}
}


int pos = 0;
int pos = 0;
BufferedReader reader = new BufferedReader(new FileReader(file));
BufferedReader reader = new BufferedReader(new FileReader(file));
for (String line = reader.readLine(); line != null; line = reader
for (String line = reader.readLine(); line != null; line = reader
.readLine()) {
.readLine()) {
for (String _word : line.split("\\W+")) {
for (String _word : line.split("\\W+")) {
String word = _word.toLowerCase();
String word = _word.toLowerCase();
pos++;
pos++;
if (stopwords.contains(word))
if (stopwords.contains(word))
continue;
continue;
List<Tuple> idx = index.get(word);
List<Tuple> idx = index.get(word);
if (idx == null) {
if (idx == null) {
idx = new LinkedList<Tuple>();
idx = new LinkedList<Tuple>();
index.put(word, idx);
index.put(word, idx);
}
}
idx.add(new Tuple(fileno, pos));
idx.add(new Tuple(fileno, pos));
}
}
}
}
System.out.println("indexed " + file.getPath() + " " + pos + " words");
System.out.println("indexed " + file.getPath() + " " + pos + " words");
}
}


public void search(List<String> words) {
public void search(List<String> words) {
for (String _word : words) {
for (String _word : words) {
Set<String> answer = new HashSet<String>();
Set<String> answer = new HashSet<String>();
String word = _word.toLowerCase();
String word = _word.toLowerCase();
List<Tuple> idx = index.get(word);
List<Tuple> idx = index.get(word);
if (idx != null) {
if (idx != null) {
for (Tuple t : idx) {
for (Tuple t : idx) {
answer.add(files.get(t.fileno));
answer.add(files.get(t.fileno));
}
}
}
}
System.out.print(word);
System.out.print(word);
for (String f : answer) {
for (String f : answer) {
System.out.print(" " + f);
System.out.print(" " + f);
}
}
System.out.println("");
System.out.println("");
}
}
}
}


public static void main(String[] args) {
public static void main(String[] args) {
try {
try {
InvertedIndex idx = new InvertedIndex();
InvertedIndex idx = new InvertedIndex();
for (int i = 1; i < args.length; i++) {
for (int i = 1; i < args.length; i++) {
idx.indexFile(new File(args[i]));
idx.indexFile(new File(args[i]));
}
}
idx.search(Arrays.asList(args[0].split(",")));
idx.search(Arrays.asList(args[0].split(",")));
} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
e.printStackTrace();
}
}
}
}


private class Tuple {
private class Tuple {
private int fileno;
private int fileno;
private int position;
private int position;


public Tuple(int fileno, int position) {
public Tuple(int fileno, int position) {
this.fileno = fileno;
this.fileno = fileno;
this.position = position;
this.position = position;
}
}
}
}
}
}


Line 1,813: Line 1,813:
Example output:
Example output:
<lang Java>
<lang Java>
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt
indexed pg30637.txt 106473 words
indexed pg30637.txt 106473 words
indexed pg7025.txt 205714 words
indexed pg7025.txt 205714 words
Line 1,848: Line 1,848:
def overlap(that): . as $this
def overlap(that): . as $this
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end);
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end);

. as $dict
. as $dict
| if (words|length) == 0 then []
| if (words|length) == 0 then []
Line 1,864: Line 1,864:
<lang jq>def prompt_search:
<lang jq>def prompt_search:
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:",
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:",
( (input | if type == "array" then . elif type == "string" then [.]
( (input | if type == "array" then . elif type == "string" then [.]
else empty
else empty
end) as $in
end) as $in
Line 1,997: Line 1,997:


Enter word(s) to be searched for in these files or 'q' to quit
Enter word(s) to be searched for in these files or 'q' to quit
? : cat
? : cat


'cat' not found
'cat' not found
Line 2,046: Line 2,046:
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \
inv.ml
inv.ml

ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo
ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo


Line 2,143: Line 2,143:
foreach my $file (@files)
foreach my $file (@files)
{
{
open(F, "<", $file) or die "Can't read file $file: $!";
open(F, "<", $file) or die "Can't read file $file: $!";
while(<F>) {
while(<F>) {
s/\A\W+//;
s/\A\W+//;
foreach my $w (map {lc} grep {length() >= 3} split /\W+/)
foreach my $w (map {lc} grep {length() >= 3} split /\W+/)
{
{
if ( exists($iindex{$w}) )
if ( exists($iindex{$w}) )
{
{
$iindex{$w}->insert($file);
$iindex{$w}->insert($file);
} else {
} else {
$iindex{$w} = set($file);
$iindex{$w} = set($file);
}
}
}
}
}
}
close(F);
close(F);
}
}
return %iindex;
return %iindex;
Line 2,167: Line 2,167:
my @words = @_;
my @words = @_;
my $res = set();
my $res = set();

foreach my $w (map {lc} @words)
foreach my $w (map {lc} @words)
{
{
$w =~ s/\W+//g; # strip non-words chars
$w =~ s/\W+//g; # strip non-words chars
length $w < 3 and next;
length $w < 3 and next;
exists $idx{$w} or return set();
exists $idx{$w} or return set();
Line 2,189: Line 2,189:
=={{header|Phix}}==
=={{header|Phix}}==
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br>
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br>
Loads all text files in demo\rosetta\ and builds a list of filenames and
Loads all text files in demo\rosetta\ and builds a list of filenames and
a dictionary of {word,file_indexes}, before a handful of quick tests.<br>
a dictionary of {word,file_indexes}, before a handful of quick tests.<br>
Might be better (and almost as easy) for the dictionary values to be say
Might be better (and almost as easy) for the dictionary values to be say
Line 2,376: Line 2,376:
# Create index hashtable, as needed
# Create index hashtable, as needed
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} }
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} }

# For each file to be indexed...
# For each file to be indexed...
ForEach ( $File in $FileList )
ForEach ( $File in $FileList )
Line 2,382: Line 2,382:
# Find any previously existing entries for this file
# Find any previously existing entries for this file
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File }
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File }

# For any previously existing entries
# For any previously existing entries
# Delete them (prior to reindexing the file)
# Delete them (prior to reindexing the file)
Line 2,389: Line 2,389:
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } )
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } )
}
}

# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates
# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique

# For each word in the file...
# For each word in the file...
ForEach ( $Word in $Words )
ForEach ( $Word in $Words )
Line 2,410: Line 2,410:
}
}
}
}

function Find-Word ( [string]$Word )
function Find-Word ( [string]$Word )
{
{
Line 2,420: Line 2,420:
various words.
various words.
'@ | Out-File -FilePath C:\Test\File1.txt
'@ | Out-File -FilePath C:\Test\File1.txt

@'
@'
Create an index
Create an index
of words.
of words.
'@ | Out-File -FilePath C:\Test\File2.txt
'@ | Out-File -FilePath C:\Test\File2.txt

@'
@'
Use the index
Use the index
Line 2,628: Line 2,628:
{{works with|rakudo|2015-09-16}}
{{works with|rakudo|2015-09-16}}
<lang perl6>sub MAIN (*@files) {
<lang perl6>sub MAIN (*@files) {
my %norm;
my %norm;
do for @files -> $file {
do for @files -> $file {
%norm.push: $file X=> slurp($file).lc.words;
%norm.push: $file X=> slurp($file).lc.words;
Line 2,862: Line 2,862:
> ./indexsearch.rb It iS\!
> ./indexsearch.rb It iS\!
["file1", "file2", "file3"]</pre>
["file1", "file2", "file3"]</pre>

=={{header|Rust}}==
<lang Rust>// Part 1: Inverted index structure

use std::{
borrow::Borrow,
collections::{BTreeMap, BTreeSet},
};

/// An inverted index: maps each token to the ordered set of source ids
/// (positions in `sources`) in which that token occurs.
#[derive(Debug, Default)]
pub struct InvertedIndex<T> {
    indexed: BTreeMap<String, BTreeSet<usize>>,
    sources: Vec<T>,
}

impl<T> InvertedIndex<T> {
    /// Registers `source` and records every token in `tokens` as occurring in it.
    pub fn add<I, V>(&mut self, source: T, tokens: I)
    where
        I: IntoIterator<Item = V>,
        V: Into<String>,
    {
        // A source's id is simply its position in `sources`.
        let id = self.sources.len();
        self.sources.push(source);
        for tok in tokens {
            let postings = self.indexed.entry(tok.into()).or_insert_with(BTreeSet::new);
            postings.insert(id);
        }
    }

    /// Returns the sources containing *all* of `tokens` (conjunctive query).
    /// An empty query, or any token with no hits, yields an empty iterator.
    pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T>
    where
        String: Borrow<K>,
        K: Ord + ?Sized + 'a,
        I: IntoIterator<Item = &'a K>,
    {
        let mut query = tokens.into_iter();

        // Seed the running intersection with the first token's posting set,
        // then narrow it with each remaining token, stopping early once empty.
        let mut hits: BTreeSet<usize> = BTreeSet::new();
        if let Some(first) = query.next() {
            if let Some(postings) = self.indexed.get(first) {
                hits = postings.clone();
                for tok in query {
                    match self.indexed.get(tok) {
                        Some(postings) => {
                            hits = hits.intersection(postings).cloned().collect();
                        }
                        None => hits.clear(),
                    }
                    if hits.is_empty() {
                        break;
                    }
                }
            }
        }

        hits.into_iter().map(move |id| &self.sources[id])
    }

    /// All distinct tokens currently indexed, in sorted order.
    pub fn tokens(&self) -> impl Iterator<Item = &str> {
        self.indexed.keys().map(String::as_str)
    }

    /// The registered sources, in insertion order.
    pub fn sources(&self) -> &[T] {
        &self.sources
    }
}

// Part 2: File walking and processing

use std::{
ffi::OsString,
fmt::{Debug, Display},
fs::{read_dir, DirEntry, File, ReadDir},
io::{self, stdin, Read},
path::{Path, PathBuf},
};

/// Recursive directory walker that yields every plain-file `DirEntry`
/// below a root directory (see the `Iterator` impl for the traversal).
#[derive(Debug)]
pub struct Files {
    // Stack of directory readers still in progress; the top of the
    // stack is the directory currently being traversed.
    dirs: Vec<ReadDir>,
}

impl Files {
    /// Starts a walk rooted at `path`. Fails only if the root directory
    /// itself cannot be opened.
    pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        Ok(Files {
            dirs: vec![read_dir(path)?],
        })
    }
}

impl Iterator for Files {
    type Item = DirEntry;

    // Depth-first traversal driven by the explicit `dirs` stack: plain
    // files are yielded one at a time; subdirectories are pushed onto the
    // stack and descended into before the current directory is resumed.
    fn next(&mut self) -> Option<Self::Item> {
        'outer: while let Some(mut current) = self.dirs.pop() {
            while let Some(entry) = current.next() {
                // Entries that fail to read (Err) are silently skipped.
                if let Ok(entry) = entry {
                    let path = entry.path();
                    if !path.is_dir() {
                        // A file: push the reader back so the next call
                        // resumes this directory, then yield the entry.
                        self.dirs.push(current);
                        return Some(entry);
                    } else if let Ok(dir) = read_dir(path) {
                        // A readable subdirectory: remember where we were,
                        // then descend into it (it is now top of stack).
                        self.dirs.push(current);
                        self.dirs.push(dir);
                        continue 'outer;
                    }
                    // Unreadable subdirectories are skipped.
                }
            }
        }

        None // No directory left
    }
}

/// Splits `input` on every non-alphanumeric character and yields each
/// non-empty piece lower-cased. Borrows `input` for the iterator's life.
fn tokenize<'a>(input: &'a str) -> impl Iterator<Item = String> + 'a {
    let is_separator = |c: char| !c.is_alphanumeric();
    input.split(is_separator).filter_map(|piece| {
        if piece.is_empty() {
            None
        } else {
            Some(piece.to_lowercase())
        }
    })
}

/// Reads the file at `path` and returns the set of distinct tokens it
/// contains. Invalid UTF-8 is replaced lossily rather than rejected.
fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> {
    let mut raw = Vec::new();
    let mut file = File::open(path)?;
    file.read_to_end(&mut raw)?;
    let text = String::from_utf8_lossy(&raw);
    let tokens: BTreeSet<String> = tokenize(&text).collect();
    Ok(tokens)
}

/// Tokenizes a search query into distinct tokens, longest first.
fn tokenize_query(input: &str) -> Vec<String> {
    use std::cmp::Reverse;

    // Deduplicate via a BTreeSet, then sort by descending length:
    // longer (rarer) tokens should narrow the result set faster.
    let unique = tokenize(input).collect::<BTreeSet<_>>();
    let mut result = unique.into_iter().collect::<Vec<_>>();
    // Stable sort + Reverse keeps equal-length tokens in alphabetical
    // order, matching the old `usize::MAX - len` trick without the hack.
    result.sort_by_key(|token| Reverse(token.len()));
    result
}

// Part 3: Interactive application

/// Parses the command line: the first argument is the root path to
/// index, every remaining argument is a file extension to accept.
/// Fails when the path argument is missing.
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> {
    let mut cli = std::env::args_os().skip(1); // drop the executable's name

    let root = match cli.next() {
        Some(path) => path,
        None => return Err(io::Error::new(io::ErrorKind::Other, "missing path")),
    };

    // Everything after the path is an extension filter entry.
    Ok((root, cli.collect()))
}

/// Prints `hits` as a numbered list (1-based), or "(none)" when the
/// iterator is empty.
// The original declared an unused lifetime parameter `'a`; removed, as
// nothing in the signature referenced it (callers are unaffected).
fn print_hits<T>(hits: impl Iterator<Item = T>)
where
    T: Display,
{
    let mut shown = 0usize;
    for hit in hits {
        shown += 1;
        println!("  [{}] {}", shown, hit);
    }

    if shown == 0 {
        println!("(none)")
    }
}

/// Entry point: builds two indexes (file names and file contents) over a
/// directory tree, then runs an interactive query loop on stdin.
fn main() -> io::Result<()> {
    let (path, extensions) = args()?;
    // `files` indexes tokens of the file PATHS; `content` indexes tokens
    // of the file CONTENTS.
    let mut files = InvertedIndex::<PathBuf>::default();
    let mut content = InvertedIndex::<PathBuf>::default();

    println!(
        "Indexing {:?} files in '{}'",
        extensions,
        path.to_string_lossy()
    );

    // NOTE(review): files with NO extension are always skipped, even when
    // `extensions` is empty — `path.extension()` returns None for them and
    // `.filter(..).is_some()` then rejects the path.
    for path in Files::walk(path)?.map(|file| file.path()).filter(|path| {
        path.extension()
            .filter(|&ext| extensions.is_empty() || extensions.contains(ext))
            .is_some()
    }) {
        // Index the path itself by its tokens (for file-name search).
        files.add(path.clone(), tokenize(&path.to_string_lossy()));

        // Index the file's contents; unreadable files are reported and
        // skipped rather than aborting the whole indexing run.
        match tokenize_file(&path) {
            Ok(tokens) => content.add(path, tokens),
            Err(e) => eprintln!("Skipping a file {}: {}", path.display(), e),
        }
    }

    println!(
        "Indexed {} tokens in {} files.",
        content.tokens().count(),
        // NOTE(review): direct field access; `content.sources().len()`
        // would use the public accessor instead.
        content.sources.len()
    );

    // Run the query UI loop
    let mut query = String::new();

    loop {
        query.clear();
        println!("Enter search query:");
        // A read error or an empty line ends the session.
        if stdin().read_line(&mut query).is_err() || query.trim().is_empty() {
            break;
        }

        match query.trim() {
            // The "" arm is unreachable (empty input breaks above); the
            // slash commands end or inspect the session.
            "/exit" | "/quit" | "" => break,

            "/tokens" => {
                // Dump every distinct content token, sorted.
                println!("Tokens:");
                for token in content.tokens() {
                    println!("{}", token);
                }

                println!();
            }

            "/files" => {
                // Dump every indexed file path, in indexing order.
                println!("Sources:");
                for source in content.sources() {
                    println!("{}", source.display());
                }

                println!();
            }

            _ => {
                // Anything else is a search: AND all query tokens against
                // both the content index and the file-name index.
                let query = tokenize_query(&query);
                println!();
                println!("Found hits:");
                print_hits(content.search(query.iter()).map(|it| it.display()));
                println!("Found file names:");
                print_hits(files.search(query.iter()).map(|it| it.display()));
                println!();
            }
        }
    }

    Ok(())
}

{{out}}
<pre>C:\Temp>inverted_index books html htm txt
Indexing {"htm", "html", "txt"} files in 'books'
Indexed 34810 tokens in 9 files.
Enter search query:
war

Found hits:
[1] books\EN\C\Chaucer, Geoffrey - Canterbury Tales.txt
[2] books\EN\H\Homer - The Odyssey.txt
[3] books\EN\O\Orwell, George - 1984.html
[4] books\EN\W\Wells, Herbert George - The Invisible Man.txt
[5] books\EN\W\Wells, Herbert George - The Island of Doctor Moreau.txt
[6] books\EN\W\Wells, Herbert George - The Time Machine.txt
[7] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
[1] books\EN\W\Wells, Herbert George - War of the Worlds.txt

Enter search query:
war gun

Found hits:
[1] books\EN\O\Orwell, George - 1984.html
[2] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
(none)

Enter search query:
/exit

C:\Temp></pre>


=={{header|Scala}}==
=={{header|Scala}}==
Line 2,934: Line 3,209:
array set localidx {}
array set localidx {}
foreach word [wordsInString $data] {
foreach word [wordsInString $data] {
lappend localidx($word) $i
lappend localidx($word) $i
incr i
incr i
}
}


# Transcribe into global index
# Transcribe into global index
foreach {word places} [array get localidx] {
foreach {word places} [array get localidx] {
dict set index($word) $filename $places
dict set index($word) $filename $places
}
}
}
}
Line 2,948: Line 3,223:
global index
global index
if {[info exists index($word)]} {
if {[info exists index($word)]} {
return [dict keys $index($word)]
return [dict keys $index($word)]
}
}
}
}
Line 2,956: Line 3,231:
set files [findFilesForWord [lindex $words 0]]
set files [findFilesForWord [lindex $words 0]]
foreach w [lrange $words 1 end] {
foreach w [lrange $words 1 end] {
set wf [findFilesForWord $w]
set wf [findFilesForWord $w]
set newfiles {}
set newfiles {}
foreach f $files {
foreach f $files {
if {$f in $wf} {lappend newfiles $f}
if {$f in $wf} {lappend newfiles $f}
}
}
set files $newfiles
set files $newfiles
}
}
return $files
return $files
Line 2,971: Line 3,246:
set files {}
set files {}
foreach w $words {
foreach w $words {
if {![info exist index($w)]} {
if {![info exist index($w)]} {
return
return
}
}
}
}
dict for {file places} $index([lindex $words 0]) {
dict for {file places} $index([lindex $words 0]) {
if {$file in $files} continue
if {$file in $files} continue
foreach start $places {
foreach start $places {
set gotStart 1
set gotStart 1
foreach w [lrange $words 1 end] {
foreach w [lrange $words 1 end] {
incr start
incr start
set gotNext 0
set gotNext 0
foreach {f ps} $index($w) {
foreach {f ps} $index($w) {
if {$f ne $file} continue
if {$f ne $file} continue
foreach p $ps {
foreach p $ps {
if {$p == $start} {
if {$p == $start} {
set gotNext 1
set gotNext 1
break
break
}
}
}
}
if {$gotNext} break
if {$gotNext} break
}
}
if {!$gotNext} {
if {!$gotNext} {
set gotStart 0
set gotStart 0
break
break
}
}
}
}
if {$gotStart} {
if {$gotStart} {
lappend files $file
lappend files $file
break
break
}
}
}
}
}
}
return $files
return $files
Line 3,014: Line 3,289:
pack [entry .found.entry -textvariable terms] -fill x
pack [entry .found.entry -textvariable terms] -fill x
pack [button .found.findAll -command FindAll \
pack [button .found.findAll -command FindAll \
-text "Find File with All"] -side left
-text "Find File with All"] -side left
pack [button .found.findSeq -command FindSeq \
pack [button .found.findSeq -command FindSeq \
-text "Find File with Sequence"] -side right
-text "Find File with Sequence"] -side right


# The actions invoked by various GUI buttons
# The actions invoked by various GUI buttons
Line 3,023: Line 3,298:
set f [tk_getOpenFile]
set f [tk_getOpenFile]
if {$f ne ""} {
if {$f ne ""} {
addDocumentToIndex $f
addDocumentToIndex $f
lappend files $f
lappend files $f
}
}
}
}
Line 3,032: Line 3,307:
set fs [findFilesWithAllWords $words]
set fs [findFilesWithAllWords $words]
lappend found "Searching for files with all $terms" {*}$fs \
lappend found "Searching for files with all $terms" {*}$fs \
"---------------------"
"---------------------"
}
}
proc FindSeq {} {
proc FindSeq {} {
Line 3,039: Line 3,314:
set fs [findFilesWithWordSequence $words]
set fs [findFilesWithWordSequence $words]
lappend found "Searching for files with \"$terms\"" {*}$fs \
lappend found "Searching for files with \"$terms\"" {*}$fs \
"---------------------"
"---------------------"
}</lang>
}</lang>


Line 3,139: Line 3,414:
# the record separator for $INDEX/all.tab).
# the record separator for $INDEX/all.tab).
for file in "$@"; do
for file in "$@"; do
# Use printf(1), not echo, because "$file" might start with
# Use printf(1), not echo, because "$file" might start with
# a hyphen and become an option to echo.
# a hyphen and become an option to echo.
test 0 -eq $(printf %s "$file" | wc -l) || {
test 0 -eq $(printf %s "$file" | wc -l) || {
printf '%s\n' "$file: newline in filename" >&2
printf '%s\n' "$file: newline in filename" >&2
exit 1
exit 1
}
}
done
done


Line 3,153: Line 3,428:
fi=1
fi=1
for file in "$@"; do
for file in "$@"; do
printf %s "Indexing $file." >&2
printf %s "Indexing $file." >&2


# all.tab maps $fi => $file
# all.tab maps $fi => $file
echo "$fi $file" >> "$INDEX/all.tab"
echo "$fi $file" >> "$INDEX/all.tab"


# Use punctuation ([:punct:]) and whitespace (IFS)
# Use punctuation ([:punct:]) and whitespace (IFS)
# to split tokens.
# to split tokens.
ti=1
ti=1
tr -s '[:punct:]' ' ' < "$file" | while read line; do
tr -s '[:punct:]' ' ' < "$file" | while read line; do
for token in $line; do
for token in $line; do
# Index token by position ($fi, $ti). Ignore
# Index token by position ($fi, $ti). Ignore
# error from mkdir(1) if directory exists.
# error from mkdir(1) if directory exists.
mkdir "$INDEX/$token" 2>/dev/null
mkdir "$INDEX/$token" 2>/dev/null
echo $ti >> "$INDEX/$token/$fi"
echo $ti >> "$INDEX/$token/$fi"
: $((ti += 1))
: $((ti += 1))


# Show progress. Print a dot per 1000 tokens.
# Show progress. Print a dot per 1000 tokens.
case "$ti" in
case "$ti" in
*000) printf .
*000) printf .
esac
esac
done
done
done
done


echo >&2
echo >&2
: $((fi += 1))
: $((fi += 1))
done</lang>
done</lang>


Line 3,188: Line 3,463:
want=sequence
want=sequence
while getopts aos name; do
while getopts aos name; do
case "$name" in
case "$name" in
a) want=all;;
a) want=all;;
o) want=one;;
o) want=one;;
s) want=sequence;;
s) want=sequence;;
*) exit 2;;
*) exit 2;;
esac
esac
done
done
shift $((OPTIND - 1))
shift $((OPTIND - 1))


all() {
all() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}


one() {
one() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}


sequence() {
sequence() {
echo "TODO"
echo "TODO"
exit 2
exit 2
}
}