Inverted index: Difference between revisions
Content added Content deleted
Thundergnat (talk | contribs) (Rename Perl 6 -> Raku, alphabetize, minor clean-up) |
(Add source for Rust) |
||
Line 5:
;Task:
Given a set of text files, implement a program to create an inverted index.
Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms.
The search index can be in memory.
<br><br>
Line 138:
Enter one or more words to search for; <return> to finish:
it
Found in the following files: 0.txt, 1.txt, 2.txt
Line 545:
Loop, %files%, 0,1
{
tooltip,%A_index% / 500
wordList := WordsIn(A_LoopFileFullPath)
InvertedIndex(wordList, A_loopFileFullpath)
}
Line 566:
WordsIn(docpath)
{
FileRead, content, %docpath%
spos = 1
Line 576:
this_wordList .= match "`n"
}
Sort, this_wordList, U
return this_wordList
}
Line 585:
global word2docs
loop, parse, words, `n,`r
{
if A_loopField =
continue
Line 609:
FileList$() = "BBCKEY0.TXT", "BBCKEY1.TXT", "BBCKEY2.TXT", \
\ "BBCKEY3.TXT", "BBCKEY4.TXT"
DictSize% = 30000
DIM Index{(DictSize%-1) word$, link%}
REM Build inverted index:
FOR file% = DIM(FileList$(),1) TO 0 STEP -1
Line 618:
F% = OPENIN(filename$)
IF F% = 0 ERROR 100, "Failed to open file"
WHILE NOT EOF#F%
REPEAT C%=BGET#F% : UNTIL C%>64 OR EOF#F% : word$ = CHR$(C%)
REPEAT C%=BGET#F% : word$ += CHR$(C%) : UNTIL C%<65
word$ = FNlower(LEFT$(word$))
hash% = FNhash(word$)
WHILE Index{(hash%)}.word$<>"" AND Index{(hash%)}.word$<>word$
Line 636:
ENDIF
ENDWHILE
CLOSE #F%
NEXT file%
REM Now query the index:
PRINT FNquery("random")
Line 646:
PRINT FNquery("the")
END
DEF FNquery(A$)
LOCAL hash%, link%, temp%
Line 663:
ENDWHILE
= LEFT$(LEFT$(A$))
DEF FNhash(A$)
LOCAL hash%
Line 670:
IF LEN(A$) > 4 hash% EOR= !(!^A$ + LEN(A$) - 4)
= hash% MOD DictSize%
DEF FNlower(A$)
LOCAL A%,C%
Line 694:
int chr_idx[256] = {0};
char idx_chr[256] = {0};
#define FNAME 0
typedef struct trie_t *trie, trie_t;
struct trie_t {
};
trie trie_new() { return calloc(sizeof(trie_t), 1); }
#define find_word(r, w) trie_trav(r, w, 1)
/* tree traversal: returns node if end of word and matches string, optionally
Line 710:
trie trie_trav(trie root, const char * str, int no_create)
{
}
}
}
}
}
/* complete traversal of whole tree, calling callback at each end of word node.
* similar method can be used to free nodes, had we wanted to do that.
Line 736:
int trie_all(trie root, char path[], int depth, int (*callback)(char *))
{
}
}
void add_index(trie root, const char *word, const char *fname)
{
}
int print_path(char *path)
{
}
/* pretend we parsed text files and got lower cased words: dealing *
* with text file is a whole other animal and would make code too long */
const char *files[] = { "f1.txt", "source/f2.txt", "other_file" };
const char *text[][5] ={{ "it", "is", "what", "it", "is" },
trie init_tables()
{
}
/* Enable USE_ADVANCED_FILE_HANDLING to use advanced file handling.
* You need to have files named like above files[], with words in them
* like in text[][]. Case doesn't matter (told you it's advanced).
*/
#define USE_ADVANCED_FILE_HANDLING 0
#if USE_ADVANCED_FILE_HANDLING
}
#else
}
}
#endif /*USE_ADVANCED_FILE_HANDLING*/
}
void search_index(trie root, const char *word)
{
}
}
int main()
{
}</lang>Output:<lang>Search for "what": f1.txt source/f2.txt
Search for "is": f1.txt other_file source/f2.txt
Line 889:
{
public:
node() { clear(); }
node( char z ) { clear(); }
~node() { for( int x = 0; x < MAX_NODES; x++ ) if( next[x] ) delete next[x]; }
Line 933:
const std::vector<std::string>& find( std::string s ) {
size_t idx;
std::transform( s.begin(), s.end(), s.begin(), tolower );
node* rt = &root;
for( std::string::iterator i = s.begin(); i != s.end(); i++ ) {
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
if( !rt->next[idx] ) return std::vector<std::string>();
rt = rt->next[idx];
}
}
if( rt->isWord ) return rt->files;
return std::vector<std::string>();
Line 951:
idx = _CHARS.find( *i );
if( idx < MAX_NODES ) {
n = rt->next[idx];
if( n ){
rt = n;
continue;
}
n = new node( *i );
rt->next[idx] = n;
rt = n;
}
}
Line 983:
}
}
while( true ) {
std::cout << "Enter one word to search for, return to exit: ";
Line 1,023:
(defn term-seq [text] (map normalize (re-seq pattern text)))
(defn set-assoc
"Produces map with v added to the set associated with key k in map m"
[m k v] (assoc m k (conj (get m k #{}) v)))
Line 1,064:
console.log "#{fn}:#{line_num}"
console.log "\n"
get_words = (line) ->
words = line.replace(/\W/g, ' ').split ' '
Line 1,081:
output
<lang>
> coffee inverted_index.coffee
locations for 'make_index':
inverted_index.coffee:3
Line 1,211:
;; get text for each file, and call (action filename text)
(define (map-files action files)
;; create store
(local-make-store INVERT)
; invert-word : word -> set of files
(define (invert-word word file store)
; parse file text and invert each word
(define (invert-text file text)
</lang>
Line 1,233:
Intersect sets values of each word.
<lang lisp>
;; usage : (inverted-search w1 w2 ..)
(define-syntax-rule (inverted-search w ...)
;; intersects all sets referenced by words
;; returns the intersection
(define (and-get-invert words)
</lang>
Output:
Line 1,271:
lists:foldl( fun import_from_file/2, dict:new(), Files ).
search(
[Files | T] = [dict:fetch(X, Inverted_index) || X <- Binaries],
lists:foldl( fun search_common/2, Files, T ).
Line 1,287:
import_from_file( File, Dict_acc ) ->
New_dict = dict:from_list( import_from_file_contents(File, file:read_file(File)) ),
import_from_file_contents( File, {ok, Binary} ) ->
[{X, [File]} || X <- binary:split( Binary, binary:compile_pattern([<<" ">>, <<"\n">>]), [global] )];
import_from_file_contents( File, {error, Error} ) ->
import_from_file_merge(
%% search_common/2: intersection step used by the foldl in search/1 —
%% keeps only those files in the accumulator Acc that also occur in Files.
search_common( Files, Acc ) -> [X || X <- Acc, lists:member(X, Files)].
Line 1,306:
// Map search terms to associated set of files
type searchIndexMap = Map<string, Set<string>>
// Read the search criteria from the console: first the file names to index,
// then the terms to find. Only the terms are normalised to lower case.
let inputSearchCriteria() =
    // Print a prompt, read one line, and split it on whitespace.
    let readLine prompt =
        printf "%s: " prompt
        Console.ReadLine().Split()
    readLine "Files", (readLine "Find") |> Array.map (fun s -> s.ToLower())
// Fold step for building the index: insert value v into the set stored
// under key k, creating a singleton set when the key is not present yet.
let updateIndex indexMap keyValuePair =
    let k, v = keyValuePair
    match Map.tryFind k indexMap with
    | None -> Map.add k (Set.singleton v) indexMap
    | Some set -> Map.add k (Set.add v set) indexMap
// Build the inverted index: collect every (lower-cased word, file) pair
// from every file and fold them into a Map<string, Set<string>>.
let buildIndex files =
    // All (word, file) pairs occurring in a single file.
    let fileData file =
        File.ReadAllText(file).Split() |> Seq.map (fun word -> word.ToLower(), file)
    files |> Seq.collect fileData
          |> Seq.fold updateIndex Map.empty
let searchFiles() =
let files, terms = inputSearchCriteria()
Line 1,481:
}
return true
}
func ui() {
fmt.Println(len(index), "words indexed in", len(indexed), "files")
Line 1,500:
fmt.Println("one match:")
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
default:
fmt.Println(len(dl), "matches:")
for _, d := range dl {
Line 1,580:
every textname := key(texts) do # build index for each 'text'
SII := InvertedIndex(SII,textname,texts[textname])
TermSearchUI(SII) # search UI
end
Line 1,601:
repeat {
writes("Enter search terms (^z to quit) : ")
terms := map(trim(read() | break))
x := []
terms ? while not pos(0) do {
tab(many(' \t'))
put(x,tab(upto('\ \t')|0))
}
show("Searching for : ",x)
show("Found in : ",s := TermSearch(ii,x)) | show("Not found : ",x)
}
write("End of search")
Line 1,617:
procedure TermSearch(ii,x) #: return set of matches or fail
every s := !x do
( /u := ii[s] ) | (u **:= ii[s])
if *u > 0 then return u
Line 1,625:
every writes(s|!x) do writes(" ")
write()
return
end</lang>
Line 1,722:
public class InvertedIndex {
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
}
Line 1,813:
Example output:
<lang Java>
java -cp bin org.rosettacode.InvertedIndex "huntsman,merit,dog,the,gutenberg,lovecraft,olympian" pg30637.txt pg7025.txt pg82.txt pg9090.txt
indexed pg30637.txt 106473 words
indexed pg7025.txt 205714 words
Line 1,848:
def overlap(that): . as $this
| reduce that[] as $item ([]; if $this|index($item) then . + [$item] else . end);
. as $dict
| if (words|length) == 0 then []
Line 1,864:
<lang jq>def prompt_search:
"Enter a string or an array of strings to search for, quoting each string, or 0 to exit:",
( (input | if type == "array" then . elif type == "string" then [.]
else empty
end) as $in
Line 1,997:
Enter word(s) to be searched for in these files or 'q' to quit
? : cat
'cat' not found
Line 2,046:
unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma \
inv.ml
ocamlc -o inv.byte unix.cma bigarray.cma nums.cma -I +sexplib sexplib.cma str.cma inv.cmo
Line 2,143:
foreach my $file (@files)
{
s/\A\W+//;
{
}
}
}
return %iindex;
Line 2,167:
my @words = @_;
my $res = set();
foreach my $w (map {lc} @words)
{
length $w < 3 and next;
exists $idx{$w} or return set();
Line 2,189:
=={{header|Phix}}==
The following is included in the distro as demo\rosetta\Inverted_index.exw.<br>
Loads all text files in demo\rosetta\ and builds a list of filenames and
a dictionary of {word,file_indexes}, before a handful of quick tests.<br>
Might be better (and almost as easy) for the dictionary values to be say
Line 2,376:
# Create index hashtable, as needed
If ( -not $Script:WordIndex ) { $Script:WordIndex = @{} }
# For each file to be indexed...
ForEach ( $File in $FileList )
Line 2,382:
# Find any previously existing entries for this file
$ExistingEntries = $Script:WordIndex.Keys | Where { $Script:WordIndex[$_] -contains $File }
# For any previously existing entries
# Delete them (prior to reindexing the file)
Line 2,389:
$Script:WordIndex[$Key] = @( $Script:WordIndex[$Key] | Where { $_ -ne $File } )
}
# Get the contents of the file, split on non-alphanumeric characters, and remove duplicates
$Words = ( Get-Content $File ) -split '[^a-zA-Z\d]' | Sort -Unique
# For each word in the file...
ForEach ( $Word in $Words )
Line 2,410:
}
}
function Find-Word ( [string]$Word )
{
Line 2,420:
various words.
'@ | Out-File -FilePath C:\Test\File1.txt
@'
Create an index
of words.
'@ | Out-File -FilePath C:\Test\File2.txt
@'
Use the index
Line 2,628:
{{works with|rakudo|2015-09-16}}
<lang perl6>sub MAIN (*@files) {
my %norm;
do for @files -> $file {
%norm.push: $file X=> slurp($file).lc.words;
Line 2,862:
> ./indexsearch.rb It iS\!
["file1", "file2", "file3"]</pre>
=={{header|Rust}}==
<lang Rust>// Part 1: Inverted index structure
use std::{
borrow::Borrow,
collections::{BTreeMap, BTreeSet},
};
/// A generic inverted index: maps tokens to the set of sources
/// (files, names, …) in which they occur.
#[derive(Debug, Default)]
pub struct InvertedIndex<T> {
    // token -> ids of the sources containing it; ids index into `sources`
    indexed: BTreeMap<String, BTreeSet<usize>>,
    // the indexed sources themselves, in insertion order
    sources: Vec<T>,
}
impl<T> InvertedIndex<T> {
    /// Registers `source` and indexes it under every token in `tokens`.
    pub fn add<I, V>(&mut self, source: T, tokens: I)
    where
        I: IntoIterator<Item = V>,
        V: Into<String>,
    {
        // The source id is simply its position in `sources`.
        let source_id = self.sources.len();
        self.sources.push(source);
        for token in tokens {
            self.indexed
                .entry(token.into())
                .or_default() // idiomatic shorthand for or_insert_with(BTreeSet::new)
                .insert(source_id);
        }
    }
    /// Returns an iterator over the sources that contain *all* of `tokens`
    /// (conjunctive query). An empty token sequence yields no sources.
    pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T>
    where
        String: Borrow<K>,
        K: Ord + ?Sized + 'a,
        I: IntoIterator<Item = &'a K>,
    {
        let mut tokens = tokens.into_iter();
        tokens
            // Seed the result with the first token's source set...
            .next()
            .and_then(|token| self.indexed.get(token).cloned())
            .and_then(|first| {
                // ...then intersect with each remaining token's set,
                // bailing out early once the intersection becomes empty.
                tokens.try_fold(first, |found, token| {
                    self.indexed
                        .get(token)
                        .map(|sources| {
                            found
                                .intersection(sources)
                                .cloned()
                                .collect::<BTreeSet<_>>()
                        })
                        .filter(|update| !update.is_empty())
                })
            })
            // No first token, or an unknown token: empty result set.
            .unwrap_or_default()
            .into_iter()
            .map(move |source| &self.sources[source])
    }
    /// All indexed tokens, in sorted (BTreeMap key) order.
    pub fn tokens(&self) -> impl Iterator<Item = &str> {
        self.indexed.keys().map(|it| it.as_str())
    }
    /// All indexed sources, in insertion order.
    pub fn sources(&self) -> &[T] {
        &self.sources
    }
}
// Part 2: File walking and processing
use std::{
ffi::OsString,
fmt::{Debug, Display},
fs::{read_dir, DirEntry, File, ReadDir},
io::{self, stdin, Read},
path::{Path, PathBuf},
};
/// Recursive directory iterator: a stack of open `ReadDir` handles,
/// one per directory level currently being traversed.
#[derive(Debug)]
pub struct Files {
    dirs: Vec<ReadDir>,
}
impl Files {
    /// Begins a recursive walk rooted at `path`.
    ///
    /// Fails only when the root directory itself cannot be opened.
    pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        let root = read_dir(path)?;
        Ok(Files { dirs: vec![root] })
    }
}
impl Iterator for Files {
    type Item = DirEntry;
    /// Depth-first traversal: yields every non-directory entry reachable
    /// from the root. Unreadable entries and directories are skipped.
    fn next(&mut self) -> Option<Self::Item> {
        // Pop the directory currently being read; it is pushed back before
        // every return/descent so iteration can resume from the same spot.
        'outer: while let Some(mut current) = self.dirs.pop() {
            while let Some(entry) = current.next() {
                // Entries that fail to read are silently ignored.
                if let Ok(entry) = entry {
                    let path = entry.path();
                    if !path.is_dir() {
                        // A file: remember where we were and yield it.
                        self.dirs.push(current);
                        return Some(entry);
                    } else if let Ok(dir) = read_dir(path) {
                        // A readable subdirectory: descend into it first,
                        // keeping `current` underneath on the stack.
                        self.dirs.push(current);
                        self.dirs.push(dir);
                        continue 'outer;
                    }
                    // Unreadable subdirectories are dropped without error.
                }
            }
        }
        None // No directory left
    }
}
/// Splits `input` on every non-alphanumeric character and yields the
/// remaining fragments as lower-cased tokens; empty fragments are dropped.
fn tokenize<'a>(input: &'a str) -> impl Iterator<Item = String> + 'a {
    input
        .split(|ch: char| !ch.is_alphanumeric())
        .filter_map(|fragment| {
            if fragment.is_empty() {
                None
            } else {
                Some(fragment.to_lowercase())
            }
        })
}
/// Reads the file at `path`, decodes it leniently as UTF-8, and returns
/// the de-duplicated (and therefore sorted) set of tokens it contains.
fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> {
    let mut raw = Vec::new();
    File::open(path)?.read_to_end(&mut raw)?;
    // Lossy decoding: invalid UTF-8 sequences become replacement chars
    // rather than aborting the indexing of the file.
    let decoded = String::from_utf8_lossy(&raw);
    let tokens = tokenize(decoded.as_ref()).collect::<BTreeSet<_>>();
    Ok(tokens)
}
/// Tokenizes a query string into a de-duplicated list of tokens,
/// ordered longest-first (ties keep the set's alphabetical order).
fn tokenize_query(input: &str) -> Vec<String> {
    // Deduplicate via a BTreeSet; iteration order is alphabetical.
    let result = tokenize(input).collect::<BTreeSet<_>>();
    // Sort longest tokens first: longer tokens tend to be rarer, so
    // processing them first narrows the intersection set faster.
    // A direct descending comparator replaces the old `usize::MAX - len`
    // sort-key trick; the sort is stable, so behavior is unchanged.
    let mut result = result.into_iter().collect::<Vec<_>>();
    result.sort_by(|a, b| b.len().cmp(&a.len()));
    result
}
// Part 3: Interactive application
/// Parses the command line: the first argument is the root path to index,
/// all remaining arguments are the accepted file extensions.
///
/// Fails with an `io::Error` when the path argument is missing.
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> {
    let mut argv = std::env::args_os().skip(1); // Skip the executable's name
    match argv.next() {
        Some(path) => Ok((path, argv.collect())),
        None => Err(io::Error::new(io::ErrorKind::Other, "missing path")),
    }
}
/// Prints the hits as a 1-based enumerated list, or "(none)" when the
/// iterator is empty.
fn print_hits<'a, T>(hits: impl Iterator<Item = T>)
where
    T: Display,
{
    let mut count = 0usize;
    for hit in hits {
        count += 1;
        println!("  [{}] {}", count, hit);
    }
    if count == 0 {
        println!("(none)")
    }
}
/// Entry point: indexes all matching files under the given root, then runs
/// an interactive query loop on stdin until EOF, a blank line, or /exit.
fn main() -> io::Result<()> {
    let (path, extensions) = args()?;
    // Two separate indexes: one over the file *names*, one over contents.
    let mut files = InvertedIndex::<PathBuf>::default();
    let mut content = InvertedIndex::<PathBuf>::default();
    println!(
        "Indexing {:?} files in '{}'",
        extensions,
        path.to_string_lossy()
    );
    // Walk the tree, keeping only files with a requested extension
    // (an empty extension set accepts any file that has an extension).
    for path in Files::walk(path)?.map(|file| file.path()).filter(|path| {
        path.extension()
            .filter(|&ext| extensions.is_empty() || extensions.contains(ext))
            .is_some()
    }) {
        // Index the path string itself, then the file's contents.
        files.add(path.clone(), tokenize(&path.to_string_lossy()));
        match tokenize_file(&path) {
            Ok(tokens) => content.add(path, tokens),
            // Unreadable files are reported but do not abort the indexing.
            Err(e) => eprintln!("Skipping a file {}: {}", path.display(), e),
        }
    }
    println!(
        "Indexed {} tokens in {} files.",
        content.tokens().count(),
        content.sources.len()
    );
    // Run the query UI loop
    let mut query = String::new();
    loop {
        query.clear();
        println!("Enter search query:");
        // A read error or a blank line ends the session.
        if stdin().read_line(&mut query).is_err() || query.trim().is_empty() {
            break;
        }
        match query.trim() {
            "/exit" | "/quit" | "" => break,
            // Debug command: dump every indexed content token.
            "/tokens" => {
                println!("Tokens:");
                for token in content.tokens() {
                    println!("{}", token);
                }
                println!();
            }
            // Debug command: list every indexed file.
            "/files" => {
                println!("Sources:");
                for source in content.sources() {
                    println!("{}", source.display());
                }
                println!();
            }
            // Anything else is a query: search contents, then file names.
            _ => {
                let query = tokenize_query(&query);
                println!();
                println!("Found hits:");
                print_hits(content.search(query.iter()).map(|it| it.display()));
                println!("Found file names:");
                print_hits(files.search(query.iter()).map(|it| it.display()));
                println!();
            }
        }
    }
    Ok(())
}</lang>
{{out}}
<pre>C:\Temp>inverted_index books html htm txt
Indexing {"htm", "html", "txt"} files in 'books'
Indexed 34810 tokens in 9 files.
Enter search query:
war
Found hits:
[1] books\EN\C\Chaucer, Geoffrey - Canterbury Tales.txt
[2] books\EN\H\Homer - The Odyssey.txt
[3] books\EN\O\Orwell, George - 1984.html
[4] books\EN\W\Wells, Herbert George - The Invisible Man.txt
[5] books\EN\W\Wells, Herbert George - The Island of Doctor Moreau.txt
[6] books\EN\W\Wells, Herbert George - The Time Machine.txt
[7] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
[1] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Enter search query:
war gun
Found hits:
[1] books\EN\O\Orwell, George - 1984.html
[2] books\EN\W\Wells, Herbert George - War of the Worlds.txt
Found file names:
(none)
Enter search query:
/exit
C:\Temp></pre>
=={{header|Scala}}==
Line 2,934 ⟶ 3,209:
array set localidx {}
foreach word [wordsInString $data] {
}
# Transcribe into global index
foreach {word places} [array get localidx] {
}
}
Line 2,948 ⟶ 3,223:
global index
if {[info exists index($word)]} {
}
}
Line 2,956 ⟶ 3,231:
set files [findFilesForWord [lindex $words 0]]
foreach w [lrange $words 1 end] {
}
}
return $files
Line 2,971 ⟶ 3,246:
set files {}
foreach w $words {
}
}
dict for {file places} $index([lindex $words 0]) {
}
}
}
}
}
return $files
Line 3,014 ⟶ 3,289:
pack [entry .found.entry -textvariable terms] -fill x
pack [button .found.findAll -command FindAll \
pack [button .found.findSeq -command FindSeq \
# The actions invoked by various GUI buttons
Line 3,023 ⟶ 3,298:
set f [tk_getOpenFile]
if {$f ne ""} {
}
}
Line 3,032 ⟶ 3,307:
set fs [findFilesWithAllWords $words]
lappend found "Searching for files with all $terms" {*}$fs \
}
proc FindSeq {} {
Line 3,039 ⟶ 3,314:
set fs [findFilesWithWordSequence $words]
lappend found "Searching for files with \"$terms\"" {*}$fs \
}</lang>
Line 3,139 ⟶ 3,414:
# the record separator for $INDEX/all.tab).
for file in "$@"; do
}
done
Line 3,153 ⟶ 3,428:
fi=1
for file in "$@"; do
done</lang>
Line 3,188 ⟶ 3,463:
want=sequence
while getopts aos name; do
done
shift $((OPTIND - 1))
all() {
}
one() {
}
sequence() {
}
|