Yahoo! search interface: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Tcl}}: link to tcllib library)
(+ AutoHotkey)
Line 3: Line 3:
Create a class for searching Yahoo results.
Create a class for searching Yahoo results.
It must implement a '''Next Page''' method, and read URL, Title and Content from results.
It must implement a '''Next Page''' method, and read URL, Title and Content from results.
=={{header|AutoHotkey}}==
translated from python example
<lang AutoHotkey>
; Demo: run the search for "test" twice, once for result page 1 and once for page 2.
yahooSearch("test", 1)
yahooSearch("test", 2)


; Downloads one page of Yahoo! search results and extracts every hit.
;   query - search terms; inserted into the URL as-is (the caller must pass
;           text that is already safe to place in a URL)
;   page  - 1-based page number; the "b" URL parameter is computed as
;           (page - 1) * 10 + 1
; The function runs in assume-global mode, so the hits are published as the
; global pseudo-arrays title1..N, content1..N and url1..N. Each title is also
; shown in a message box. The raw page is kept in search.txt.
yahooSearch(query, page)
{
global
start := ((page - 1) * 10) + 1
filedelete, search.txt
; No "%" after "p=" / "b=": those were left over from a printf-style template
; and produced a malformed query string.
urldownloadtofile, % "http://search.yahoo.com/search?p=" . query . "&b=" . start, search.txt
if (ErrorLevel)
{
; Download failed: there is no fresh file to parse, so stop here.
return
}
fileread, content, search.txt
reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>
index := found := 1
; Each iteration resumes the scan just after the start of the previous match.
while (found := regexmatch(content, reg, self, found + 1))
{
msgbox % title%A_Index% := fix(self1)
content%A_Index% := fix(self2)
url%A_Index% := fix(self3)
}
}


; Strips the highlighting markup Yahoo! embeds in a captured fragment and cuts
; off anything from the closing title markup onwards.
;   url - raw text captured from the result page (title, abstract or URL)
; Returns the cleaned text.
fix(url)
{
; The bold ellipsis must be removed first: once <b> and </b> are gone it can
; no longer be recognised and a bare "..." would be left behind.
stringreplace, url, url, <b>...</b>, ,All
stringreplace, url, url, <b>, ,All
stringreplace, url, url, </b>, ,All
stringreplace, url, url, <wbr />, ,All
stringreplace, url, url, <wbr>, ,All
if (pos := instr(url, "</a></h3></div>"))
{
; InStr is 1-based, so pos - 1 characters precede the marker.
StringLeft, url, url, pos - 1
}
return url
}
</lang>
=={{header|C sharp|C#}}==
=={{header|C sharp|C#}}==



Revision as of 01:57, 11 June 2009

Task
Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.

Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.

AutoHotkey

translated from python example
<lang AutoHotkey>
; Demo: run the search for "test" twice, once for result page 1 and once for page 2.
yahooSearch("test", 1)
yahooSearch("test", 2)

; Downloads one page of Yahoo! search results and extracts every hit.
;   query - search terms; inserted into the URL as-is (the caller must pass
;           text that is already safe to place in a URL)
;   page  - 1-based page number; the "b" URL parameter is computed as
;           (page - 1) * 10 + 1
; The function runs in assume-global mode, so the hits are published as the
; global pseudo-arrays title1..N, content1..N and url1..N. Each title is also
; shown in a message box. The raw page is kept in search.txt.
yahooSearch(query, page)
{
 global
 start := ((page - 1) * 10) + 1
 filedelete, search.txt
 ; No "%" after "p=" / "b=": those were left over from a printf-style template
 ; and produced a malformed query string.
 urldownloadtofile, % "http://search.yahoo.com/search?p=" . query . "&b=" . start, search.txt
 if (ErrorLevel)
 {
   ; Download failed: there is no fresh file to parse, so stop here.
   return
 }
 fileread, content, search.txt
 ; Pattern restored on a single line; its markup had been stripped from this copy.
 reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>
 index := found := 1
 ; Each iteration resumes the scan just after the start of the previous match.
 while (found := regexmatch(content, reg, self, found + 1))
 {
   msgbox % title%A_Index% := fix(self1)
   content%A_Index% := fix(self2)
   url%A_Index% := fix(self3)
 }
}


; Strips the highlighting markup Yahoo! embeds in a captured fragment and cuts
; off anything from the closing title markup onwards.
;   url - raw text captured from the result page (title, abstract or URL)
; Returns the cleaned text.
fix(url)
{
 ; The bold ellipsis must be removed first: once <b> and </b> are gone it can
 ; no longer be recognised and a bare "..." would be left behind.
 stringreplace, url, url, <b>...</b>, ,All
 stringreplace, url, url, <b>, ,All
 stringreplace, url, url, </b>, ,All
 stringreplace, url, url, <wbr />, ,All
 stringreplace, url, url, <wbr>, ,All
 if (pos := instr(url, "</a></h3></div>"))
 {
   ; InStr is 1-based, so pos - 1 characters precede the marker.
   StringLeft, url, url, pos - 1
 }
 return url
}
</lang>

C#

<lang csharp>using System; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.Linq;

/// <summary>
/// One page of Yahoo! web-search results for a query, obtained by downloading
/// the HTML result page and scraping it with a regular expression.
/// </summary>
class YahooSearch {

    // Captures, in order: title, abstract, display URL of one hit.
    // The markup inside this pattern had been stripped from the listing; it is
    // restored here from the pattern used by the other language versions.
    // NOTE(review): the damaged original also contained an empty-bodied
    // conditional group "(?(...))" after the title; it appears to consume no
    // input, so it is omitted here. Confirm against the original revision.
    private static readonly Regex ResultPattern = new Regex(
        "<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a></h3></div>" +
        "<div class=\"abstr\">(.+?)</div><span class=url>(.+?)</span>");

    // Highlighting markup removed from every captured fragment. The bold
    // ellipsis comes first: once <b> and </b> are removed it cannot match.
    private static readonly string[] NoiseMarkup = {
        "<b>...</b>", "<b>", "</b>", "<wbr />", "<wbr>"
    };

    // Everything from this marker onwards is cut off a captured fragment.
    private const string TitleTail = "</a></h3></div>";

    private string query;
    private string content;
    private int page = 1;

    /// <summary>Downloads the first result page for <paramref name="query"/>.</summary>
    public YahooSearch(string query) {
        this.query = query;
        this.content = Download("http://search.yahoo.com/search?p=" + Uri.EscapeDataString(query ?? ""));
    }

    /// <summary>Downloads the given 1-based result page for <paramref name="query"/>.</summary>
    public YahooSearch(string query, int page) {
        this.query = query;
        this.page = page;
        this.content = Download(String.Format(
            "http://search.yahoo.com/search?p={0}&b={1}",
            Uri.EscapeDataString(query ?? ""),
            ((this.page - 1) * 10) + 1));
    }

    // Fetches a URL as text and releases the WebClient afterwards.
    private static string Download(string url) {
        using (WebClient client = new WebClient()) {
            return client.DownloadString(url);
        }
    }

    // Removes highlighting markup and any trailing title markup from a capture.
    string Fix(string x) {
        foreach (string markup in NoiseMarkup) {
            x = x.Replace(markup, "");
        }
        int i = x.IndexOf(TitleTail, StringComparison.Ordinal);
        // ">= 0" so that a marker at the very start is cut too.
        return i >= 0 ? x.Substring(0, i) : x;
    }

    /// <summary>All hits found on the downloaded page, in page order.</summary>
    public YahooResult[] Results {
        get {
            List<YahooResult> results = new List<YahooResult>();
            foreach (Match e in ResultPattern.Matches(this.content)) {
                string rurl = Fix(e.Groups[3].Value);
                string rtitle = Fix(e.Groups[1].Value);
                string rcontent = Fix(e.Groups[2].Value);

                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }
            return results.ToArray();
        }
    }

    /// <summary>Returns a new search positioned on the page after this one.</summary>
    public YahooSearch NextPage() {
        return new YahooSearch(this.query, this.page + 1);
    }

    /// <summary>Returns a new search for the same query on an arbitrary page.</summary>
    public YahooSearch GetPage(int page) {
        return new YahooSearch(this.query, page);
    }

}

/// <summary>
/// Plain data holder for one search hit: its URL, its title and its abstract.
/// </summary>
class YahooResult {

    /// <summary>Creates a hit from its three parts.</summary>
    public YahooResult(string url, string title, string content) {
        URL = url;
        Title = title;
        Content = content;
    }

    /// <summary>Address of the hit.</summary>
    public string URL { get; set; }

    /// <summary>Headline of the hit.</summary>
    public string Title { get; set; }

    /// <summary>Abstract text shown under the headline.</summary>
    public string Content { get; set; }

}

// Usage:

/// <summary>Sample program: prints the titles found on the first page for "test".</summary>
class Prog {

    static void Main() {
        YahooSearch search = new YahooSearch("test");
        YahooResult[] hits = search.Results;
        for (int n = 0; n < hits.Length; n++) {
            Console.WriteLine(hits[n].Title);
        }
    }

}</lang>

Perl

<lang perl>package YahooSearch;

use Encode; use HTTP::Cookies; use WWW::Mechanize;

# --- Internals -------------------------------------------------

# apply BLOCK VALUE
# Runs BLOCK with $_ set to a private copy of VALUE and returns whatever $_
# holds afterwards; the caller's VALUE is left untouched.
sub apply (&$) {
    my ($code, $value) = @_;
    local $_ = $value;
    $code->();
    return $_;
}
# We construct a cookie to get 100 results per page and prevent
# "enhanced results".

# Preference flags, each prefixed with "!", later joined with "|".
my @pref_flags = map { '!' . $_ }
    qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

# Value of the "sB" preference cookie; the joined flag list is percent-encoded.
my $search_prefs = 'v=1&n=100&sm=' .
    apply { s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge }
    join('|', @pref_flags);

# Cookie jar carrying the preference cookie for search.yahoo.com.
my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

# Shared user agent; stack_depth 0 means no page history is kept.
my $mech = WWW::Mechanize->new(
    cookie_jar  => $cookies,
    stack_depth => 0,
);

# read_page()
# Scrapes the page currently loaded in $mech.
# Returns a two-element list:
#   - the URL of the 'Next >' link, or undef when the page has no such link
#   - a reference to an array of hashes with the keys url, title and content
#     (title and content have their tags removed and are UTF-8 encoded)
sub read_page {
    # find_link yields undef when nothing matches; calling ->url on that
    # would die, and next_page relies on getting a false value instead.
    my $next_link = $mech->find_link(text => 'Next >');
    my $next = $next_link ? $next_link->url : undef;

    my $page = decode 'iso-8859-1', $mech->content;
    my @results;

    # The markup inside this pattern had been stripped from the listing; it is
    # restored from the pattern used by the other language versions.
    # NOTE(review): confirm the abstract <div> against the original revision.
    while ($page =~ m{
            <a \s class="yschttl \s spt" \s href=" ([^"]+) " \s* >
            (.+?) </a>
            .+?
            <div \s class="abstr">
            (.+?) </div>
        }xg)
    {
        push @results, {url => $1, title => $2, content => $3};
        # Hash slice: clean the title and content of the hit just stored.
        foreach ( @{$results[-1]}{qw(title content)} ) {
            s/<.+?>//g;
            $_ = encode 'utf8', $_;
        }
    }
    return $next, \@results;
}
# --- Methods ---------------------------------------------------

# YahooSearch->new(QUERY)
# Runs the search for QUERY and returns an object holding the first page of
# results together with the link to the following page.
sub new {
    my ($proto, $terms) = @_;

    # Percent-encode everything except letters, digits and spaces, then turn
    # the spaces into "+".
    my $escaped = apply {
        s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
        s/ /+/g;
    } $terms;

    $mech->get('http://search.yahoo.com/search?p=' . $escaped);
    my ($following, $hits) = read_page();

    my $self = {link_to_next => $following, results => $hits};
    return bless $self, ref($proto) || $proto;
}

# $search->results
# Returns the list of result hashes for the page currently held by the object.
sub results {
    my ($self) = @_;
    return @{ $self->{results} };
}

# $search->next_page
# Loads the following page of results into the object.
# Returns 1 on success. When no link to a following page is stored, the
# result list is emptied and undef is returned.
sub next_page {
    my ($self) = @_;
    my $link = $self->{link_to_next};

    if (!$link) {
        $self->{results} = [];
        return undef;
    }

    $mech->get($link);
    my ($following, $hits) = read_page();
    @{$self}{qw(link_to_next results)} = ($following, $hits);
    return 1;
}</lang>

Python

<lang python>import urllib import re

def fix(x):
    """Strip Yahoo!'s highlighting markup from a captured fragment.

    Removes ``<b>...</b>``, ``<b>``, ``</b>``, ``<wbr />`` and ``<wbr>``, then
    cuts the text at the first ``</a></h3></div>`` if that marker is present.

    :param x: raw text captured from the result page
    :return: the cleaned text
    """
    # The bold ellipsis goes first: once <b> and </b> are removed it can no
    # longer match and a bare "..." would be left behind.
    for markup in ("<b>...</b>", "<b>", "</b>", "<wbr />", "<wbr>"):
        x = x.replace(markup, "")
    end = x.find("</a></h3></div>")
    # find() returns -1 when the marker is absent; slicing with -1 would
    # silently drop the last character.
    return x if end < 0 else x[:end]

class YahooSearch:
    """One page of Yahoo! web-search results, scraped from the HTML page.

    Attributes:
        query:   the search terms as given by the caller
        page:    1-based page number
        url:     the address that was fetched
        content: the downloaded page text
    """

    # Captures, in order: title, abstract, display URL of one hit. The markup
    # inside this pattern had been stripped from the listing; it is restored
    # from the pattern used by the other language versions.
    _RESULT_RE = re.compile(
        "<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a></h3></div>"
        "<div class=\"abstr\">(.+?)</div><span class=url>(.+?)</span>")

    def __init__(self, query, page=1):
        """Download result page ``page`` (1-based) for ``query``."""
        # Local import so the class works on both Python 3 and Python 2.
        try:
            from urllib.request import urlopen
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlopen, urlencode

        self.query = query
        self.page = page
        # urlencode escapes spaces, "&" and similar characters in the query.
        self.url = "http://search.yahoo.com/search?" + urlencode(
            [("p", self.query), ("b", (self.page - 1) * 10 + 1)])
        response = urlopen(self.url)
        try:
            raw = response.read()
        finally:
            response.close()
        # Python 3 returns bytes, which the str pattern cannot search.
        if not isinstance(raw, str):
            raw = raw.decode("utf-8", "replace")
        self.content = raw

    def getresults(self):
        """Parse the downloaded page and return its hits as YahooResult objects."""
        self.results = []

        for i in self._RESULT_RE.findall(self.content):

            title = fix(i[0])
            content = fix(i[1])
            url = fix(i[2])

            self.results.append(YahooResult(title, content, url))

        return self.results

    def getnextpage(self):
        """Return a new YahooSearch for the page after this one."""
        return YahooSearch(self.query, self.page+1)

    results = property(fget=getresults)
    nextpage = property(fget=getnextpage)

class YahooResult:
    """Plain record for one search hit: title, abstract text and URL."""

    def __init__(self,title,content,url):
        """Store the three parts of a hit unchanged."""
        self.title, self.content, self.url = title, content, url

# Usage:

x = YahooSearch("test")

for result in x.results:

   print result.title</lang>

Tcl

Translation of: Python

<lang tcl>package require http

# fix s
#   Strips Yahoo!'s highlighting markup from a captured fragment and drops
#   everything from the closing title markup onwards. Returns the cleaned text.
proc fix s {
    # The markup keys had been stripped from the listing; they are restored
    # from the replacements used by the other language versions. string map
    # tries the keys in order, so the bold ellipsis is listed first.
    string map {<b>...</b> "" <b> "" </b> "" <wbr> "" "<wbr />" ""} \
        [regsub "</a></h3></div>.*" $s ""]
}

# YahooSearch term ?page?
#   Fetches result page $page (1-based, default 1) for $term and returns a
#   flat list of title, content, url triples. Also (re)defines the command
#   Nextpage so that calling it fetches the following page for the same term.
proc YahooSearch {term {page 1}} {
    # Build the (ugly) scraper URL
    append re {<a class="yschttl spt" href=".+?" >(.+?)</a></h3></div>}
    append re {<div class="abstr">(.+?)</div>}
    append re {<span class=url>(.+?)</span>}

    # Perform the query; note that this handles special characters
    # in the query term correctly
    set q [http::formatQuery p $term b [expr {$page*10-9}]]
    set token [http::geturl http://search.yahoo.com/search?$q]
    set data [http::data $token]
    http::cleanup $token

    # Assemble the results into a nice list
    set results {}
    foreach {- title content url} [regexp -all -inline $re $data] {
        lappend results [fix $title] [fix $content] [fix $url]
    }

    # set up the call for the next page
    interp alias {} Nextpage {} YahooSearch $term [incr page]
    return $results
}

# Usage: get the first two pages of results

# Print the titles from the first page ...
foreach {title content url} [YahooSearch "test"] {
    puts $title
}
# ... and from the page after it.
foreach {title content url} [Nextpage] {
    puts $title
}</lang>

Works with: Tcl version 8.6

With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like: <lang tcl>package require Tcl 8.6

# Object wrapper around YahooSearch that remembers the search term and the
# current page, and offers an iteration method over the hits of that page.
oo::class create WebSearcher {
    variable page term results

    # Remember the term and load the first page of results straight away.
    constructor searchTerm {
        set term $searchTerm
        set page 0
        my nextPage
    }

    # Iteration in the Tcl manner: run $body in the caller's scope once per
    # hit, with the three named caller variables set to title, content, url.
    method for {titleVar contentsVar urlVar body} {
        upvar 1 $titleVar hitTitle $contentsVar hitContent $urlVar hitUrl
        foreach {hitTitle hitContent hitUrl} $results {
            uplevel 1 $body
        }
    }

    # Advance to the following page. The fetching itself is delegated to the
    # YahooSearch procedure rather than being written out again here.
    method nextPage {} {
        incr page
        set results [YahooSearch $term $page]
        return
    }
}

# How to use. Note the 'for' method use below; new "keywords" as methods!

set ytest [WebSearcher new "test"]
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest nextPage
$ytest for title - url {
    puts "\"$title\" : $url"
}
# TclOO objects are disposed of with the standard "destroy" method.
$ytest destroy ;# standard method that deletes the object</lang>
However, the paradigm of an iterator is also interesting and is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic...
<lang tcl>package require Tcl 8.6

# yahoo! term
#   Creates a coroutine that hands out search hits for $term one at a time
#   and returns the coroutine's command name. Every call of that command
#   yields a dict with the keys title, content and url. When a page comes
#   back empty the coroutine finishes with a "break" return code.
proc yahoo! term {
    coroutine yahoo![incr ::yahoo] apply {term {
        # First yield hands the coroutine's own name back to the creator.
        yield [info coroutine]
        while {true} {
            set batch [YahooSearch $term [incr pageNo]]
            if {![llength $batch]} {
                return -code break
            }
            foreach {title content url} $batch {
                yield [dict create title $title content $content url $url]
            }
        }
    }} $term
}

# test by getting first fifty titles...

set it [yahoo! "test"]
for {set i 50} {$i>0} {incr i -1} {
    puts [dict get [$it] title]
    after 300  ;# Slow the code down... :-)
}</lang>

Another approach: uses a class as specified in the task. Also, uses an html parser from

Library: tcllib

(parsing html with regular expressions is a particular annoyance of mine).

Works with: Tcl version 8.6

<lang tcl>package require Tcl 8.6 package require http package require htmlparse package require textutil::adjust

# Search object that downloads Yahoo! result pages and extracts the hits with
# the htmlparse callback parser instead of regular expressions. Hits are
# handed out one at a time by the nextresult method as dicts with the keys
# url and title, plus abstract when an abstract block was seen for the hit.
oo::class create yahoosearch {

   # Start a new search: remember the term, reset the start offset to 1 and
   # set the address of the search service.
   method search {s} {
       my variable searchterm page baseurl
       set searchterm $s
       set page 1
       set baseurl {http://ca.search.yahoo.com/search}
   }
   # Download the current page and run it through the parser; the callback
   # fills the results list. The parser state is reset beforehand.
   method getresults {} {
       my variable state results current_data
       set results [list]
       set current_data [dict create]
       set state looking_for_results
       htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
   }
   # Move on by ten and load that page. The page variable is sent as the
   # "b" query parameter by gethtml.
   method nextpage {} {
       my variable page
       incr page 10
       my getresults
   }
   
   # Return the next hit. Loads the first page on first use and the following
   # page once the current list is used up. The hit is removed from the list.
   method nextresult {} {
       my variable results page
       if { ! [info exists results]} {
           my getresults
       } elseif {[llength $results] == 0} {
           my nextpage
       }
       # lassign stores the first element in result and returns the rest.
       set results [lassign $results result]
       return $result
   }
   # Fetch the page for the current term and offset and return its HTML text.
   method gethtml {} {
       my variable searchterm page baseurl
       set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
       set response [http::geturl $url]
       set html [http::data $response]
       http::cleanup $response
       return $html
   }
   # Parser callback, invoked by htmlparse::parse with the tag name, a slash
   # marker for closing tags, the tag parameters and the text that follows
   # the tag. It implements a small state machine driven by the state variable.
   method html_parser_callback {tag slash param textBehindTheTag} {
       my variable state results current_data
       switch -exact -- $state {
           looking_for_results {
               # Skip everything until the div with id main is reached.
               if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                   set state ready
               }
           }
           ready {
               # A div whose class starts with res, or the closing html tag,
               # ends the hit collected so far; store it if it has any data.
               if {($tag eq "div" && [string first {class="res} $param] != -1) ||
                   ($tag eq "html" && $slash eq "/")
               } {
                   if {[dict size $current_data] > 0} {lappend results $current_data}
                   set current_data [dict create]
                   set state getting_url
               }
           }
           getting_url {
               # The title link carries the target address in its href.
               if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                   if {[regexp {href="(.+?)"} $param - url]} {
                       dict set current_data url $url
                   } else {
                       dict set current_data url "no href in tag params: '$param'"
                   }
                   dict set current_data title $textBehindTheTag
                   set state getting_title
               }
           }
           getting_title {
               # Text after nested tags still belongs to the title until the
               # closing a tag is seen.
               if {$tag eq "a" && $slash eq "/"} {
                   set state looking_for_abstract
               } else {
                   dict append current_data title $textBehindTheTag
               }
           }
           looking_for_abstract {
               # Reaching the url span first means no abstract was found for
               # this hit; otherwise start collecting at the abstr div.
               if {$tag eq "span" && [string first {class="url} $param] != -1} {
                   set state ready
               } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                   dict set current_data abstract $textBehindTheTag
                   set state getting_abstract
               }
           }
           getting_abstract {
               # Collect text until the closing div tag.
               if {$tag eq "div" && $slash eq "/"} {
                   set state ready
               } else {
                   dict append current_data abstract $textBehindTheTag
               }
           }
       }
   }

}

yahoosearch create searcher
searcher search "search text here"

# Show the first 15 hits: title, address and the indented, re-wrapped abstract.
for {set x 1} {$x <= 15} {incr x} {
    set result [searcher nextresult]
    # An empty value means the search has run out of hits.
    if {$result eq ""} break
    dict with result {
        puts $title
        puts $url
        puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] "  "]
        puts ""
    }
}</lang>