Yahoo! search interface

From Rosetta Code
Revision as of 21:05, 14 May 2009 by rosettacode>Glennj (→‎{{header|Tcl}}: add example using OO and an html parser)
Task
Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.

Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.

C#

<lang csharp>using System; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.Linq;

/// <summary>
/// One page of Yahoo! web-search results, obtained by downloading the result
/// page and scraping it with a regular expression.
/// </summary>
class YahooSearch {

    // NOTE(review): the published listing lost every HTML tag that appeared
    // inside its string literals. The "abstr" and "url" anchors below are
    // reconstructed from the class names the Tcl parser example on this page
    // tests for; confirm them against the page history / live markup.
    // Groups: 1 = title, 2 = abstract, 3 = displayed URL.
    private static readonly Regex ResultPattern = new Regex(
        "<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>" +
        ".*?<div class=\"abstr[^>]*>(.+?)</div>" +
        ".*?<span class=\"?url[^>]*>(.+?)</span>");

    // Inline markup and ellipses removed from every captured field.
    // NOTE(review): the four tag names were stripped from the listing and are
    // reconstructed (bold highlight and word-break tags) - confirm.
    // String.Replace throws ArgumentException for an empty search string, so
    // the stripped form Replace("", "") could never have run.
    private static readonly string[] Noise = { "<b>", "</b>", "<wbr />", "<wbr>", "..." };

    private string query;   // raw (unescaped) search terms, reused by NextPage/GetPage
    private string content; // HTML of the downloaded result page
    private int page = 1;   // 1-based page number

    /// <summary>Fetches the first result page for <paramref name="query"/>.</summary>
    public YahooSearch(string query) {
        this.query = query;
        this.content = Download("http://search.yahoo.com/search?p=" + Uri.EscapeDataString(query));
    }

    /// <summary>
    /// Fetches result page <paramref name="page"/> (1-based) for <paramref name="query"/>.
    /// </summary>
    public YahooSearch(string query, int page) {
        this.query = query;
        this.page = page;
        // The "b" parameter is the 1-based index of the first result: 1, 11, 21, ...
        this.content = Download(String.Format("http://search.yahoo.com/search?p={0}&b={1}",
            Uri.EscapeDataString(query), ((this.page - 1) * 10) + 1));
    }

    // Downloads a URL as text, disposing the WebClient afterwards.
    private static string Download(string url) {
        using (WebClient client = new WebClient()) {
            return client.DownloadString(url);
        }
    }

    // Removes inline markup from a captured field and cuts it at the first
    // closing anchor tag, if there is one.
    private static string Fix(string x) {
        foreach (string junk in Noise) {
            x = x.Replace(junk, "");
        }
        int i = x.IndexOf("</a>", StringComparison.Ordinal);
        return i >= 0 ? x.Substring(0, i) : x;
    }

    /// <summary>The results scraped from this page (empty when nothing matches).</summary>
    public YahooResult[] Results {
        get {
            List<YahooResult> results = new List<YahooResult>();
            foreach (Match e in ResultPattern.Matches(this.content)) {
                string rtitle = Fix(e.Groups[1].Value);
                string rcontent = Fix(e.Groups[2].Value);
                string rurl = Fix(e.Groups[3].Value);

                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }
            return results.ToArray();
        }
    }

    /// <summary>Downloads and returns the page following this one.</summary>
    public YahooSearch NextPage() {
        return new YahooSearch(this.query, this.page + 1);
    }

    /// <summary>Downloads and returns the given 1-based page of the same query.</summary>
    public YahooSearch GetPage(int page) {
        return new YahooSearch(this.query, page);
    }

}

/// <summary>A single search hit: its URL, its title and its abstract text.</summary>
class YahooResult {

    private string url;
    private string title;
    private string content;

    /// <summary>Address of the hit.</summary>
    public string URL {
        get { return url; }
        set { url = value; }
    }

    /// <summary>Title of the hit.</summary>
    public string Title {
        get { return title; }
        set { title = value; }
    }

    /// <summary>Abstract text of the hit.</summary>
    public string Content {
        get { return content; }
        set { content = value; }
    }

    /// <summary>Creates a result from its three fields.</summary>
    public YahooResult(string url, string title, string content) {
        URL = url;
        Title = title;
        Content = content;
    }

}

// Usage:

/// <summary>Demo: prints the title of every hit on the first page for "test".</summary>
class Prog {

    static void Main() {
        YahooSearch search = new YahooSearch("test");
        YahooResult[] hits = search.Results;
        for (int i = 0; i < hits.Length; i++) {
            Console.WriteLine(hits[i].Title);
        }
    }

}</lang>

Perl

<lang perl>package YahooSearch;

use Encode; use HTTP::Cookies; use WWW::Mechanize;

# --- Internals -------------------------------------------------

# apply BLOCK VALUE
# Runs BLOCK with $_ temporarily set to a copy of VALUE and returns whatever
# $_ holds afterwards, so in-place operators such as s/// can be used as
# expressions.
sub apply (&$)
{
    my ($code, $value) = @_;
    local $_ = $value;
    $code->();
    return $_;
}
# We construct a cookie to get 100 results per page and prevent
# "enhanced results".

# Preference flags, each prefixed with '!', that go into the "sm" field of
# the cookie value.
my @pref_flags = map {'!' . $_}
    qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

# Cookie value: n=100 asks for 100 results per page; the '|'-joined flag list
# is percent-encoded (every non-alphanumeric character becomes %XX).
my $search_prefs = 'v=1&n=100&sm=' .
    apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge} join '|', @pref_flags;

# Cookie jar holding the single 'sB' preference cookie for search.yahoo.com.
my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

# Shared user agent for all requests; stack_depth => 0 keeps no page history.
my $mech = WWW::Mechanize->new(
    cookie_jar  => $cookies,
    stack_depth => 0,
);

# read_page
# Scrapes the page currently loaded in $mech.
# Returns a two-element list: the URL of the 'Next >' link (undef when the
# page has no such link) and a reference to an array of hashes with the keys
# url, title and content.
sub read_page
{
    # find_link returns undef when nothing matches, so ->url must not be
    # chained onto it directly: that would die on the last result page.
    my $next_link = $mech->find_link(text => 'Next >');
    my $next = $next_link ? $next_link->url : undef;

    my $page = decode 'iso-8859-1', $mech->content;
    my @results;

    # NOTE(review): the published listing lost the HTML tags of this pattern
    # and collapsed its lines, which under /x turned the title group into a
    # comment. The "abstr" anchor is reconstructed from the class name the
    # Tcl parser example on this page tests for; confirm against the page
    # history / live markup.
    while ($page =~ m
           {<a \s class="yschttl \s spt" \s
            href=" ([^"]+) " \s* >         # $1: target URL
            (.+?) </a>                     # $2: title, may contain inline tags
            .+?
            <div \s class="abstr[^>]*>
            (.+?) </div>                   # $3: abstract
           }xg)
    {
        push @results, {url => $1, title => $2, content => $3};
        # Strip inline tags from the text fields and hand them out as UTF-8.
        foreach ( @{$results[-1]}{qw(title content)} )
        {
            s/<.+?>//g;
            $_ = encode 'utf8', $_;
        }
    }
    return $next, \@results;
}
# --- Methods ---------------------------------------------------

# new QUERY
# Constructor: percent-encodes QUERY (spaces become '+'), fetches the first
# result page and returns an object holding that page's results together
# with the link to the following page.
sub new
{
    my ($invocant, $query) = @_;
    my $class = ref($invocant) || $invocant;

    my $escaped = apply
        {s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
         s/ /+/g;}
        $query;
    $mech->get('http://search.yahoo.com/search?p=' . $escaped);

    my ($next, $results) = read_page();
    my $self = {link_to_next => $next, results => $results};
    return bless $self, $class;
}

# results
# Returns the list of result hashes for the current page (the number of
# results when called in scalar context).
sub results
{
    my ($self) = @_;
    return @{ $self->{results} };
}

# next_page
# Advances the object to the following result page.
# Returns 1 on success. When there is no further page, the result list is
# emptied and undef is returned.
sub next_page
{
    my ($self) = @_;
    my $url = $self->{link_to_next};
    if (!$url)
    {
        $self->{results} = [];
        return undef;
    }

    $mech->get($url);
    my ($next, $results) = read_page();
    @{$self}{qw(link_to_next results)} = ($next, $results);
    return 1;
}</lang>

Python

<lang python>import urllib import re

def fix(x):
    """Clean one captured fragment of a result page.

    Removes inline markup and ellipses, then cuts the text at the first
    closing anchor tag. When there is no closing anchor the whole cleaned
    text is returned (slicing with the -1 that ``find`` returns would drop
    the last character).
    """
    # NOTE(review): the four tag names were stripped from the published
    # listing and are reconstructed (bold highlight and word-break tags);
    # confirm against the page history.
    for junk in ("<b>", "</b>", "<wbr />", "<wbr>", "..."):
        x = x.replace(junk, "")
    end = x.find("</a>")
    if end < 0:
        return x
    return x[:end]

class YahooSearch:
    """One page of Yahoo! search results (Python 2, uses ``urllib.urlopen``).

    ``query`` is the raw search string and ``page`` the 1-based page number.
    The page is downloaded when the object is created. ``results`` and
    ``nextpage`` are read-only properties.
    """

    # NOTE(review): the published listing lost the HTML tags inside this
    # pattern. The "abstr" and "url" anchors are reconstructed from the class
    # names the Tcl parser example on this page tests for; confirm against
    # the page history / live markup.
    # Groups: title, abstract, displayed URL.
    _RESULT_RE = re.compile(
        r'<a class="yschttl spt" href=".+?" >(.+?)</a>'
        r'.*?<div class="abstr[^>]*>(.+?)</div>'
        r'.*?<span class="?url[^>]*>(.+?)</span>')

    def __init__(self, query, page=1):
        self.query = query
        self.page = page
        # "b" is the 1-based index of the first result: 1, 11, 21, ...
        # The query is URL-encoded so spaces, '&', '#', ... survive.
        self.url = "http://search.yahoo.com/search?p=%s&b=%s" % (
            urllib.quote_plus(self.query), (self.page - 1) * 10 + 1)
        self.content = urllib.urlopen(self.url).read()

    def getresults(self):
        """Return the list of YahooResult objects scraped from this page."""
        # A local list is used on purpose: assigning to self.results would
        # collide with the ``results`` property defined below.
        found = []
        for title, content, url in self._RESULT_RE.findall(self.content):
            found.append(YahooResult(fix(title), fix(content), fix(url)))
        return found

    def getnextpage(self):
        """Download and return the page that follows this one."""
        return YahooSearch(self.query, self.page + 1)

    results = property(fget=getresults)
    nextpage = property(fget=getnextpage)

class YahooResult:
    """Plain record for one search hit."""

    def __init__(self, title, content, url):
        # Stored exactly as given; no cleaning is done here.
        self.title, self.content, self.url = title, content, url

# Usage:

x = YahooSearch("test")

for result in x.results:

   print result.title</lang>

Tcl

Translation of: Python
This example is incorrect. Please fix the code and remove this message.

Details: This example is not implementing a Next Page method.

<lang tcl>package require http

# fix s --
#   Cleans one captured fragment: everything from the first closing anchor
#   tag onwards is dropped, then ellipses and inline markup are removed.
#   NOTE(review): the tag names in the map were stripped from the published
#   listing and are reconstructed (bold highlight and word-break tags);
#   confirm against the page history.
proc fix s {
    string map {... "" <b> "" </b> "" <wbr> "" "<wbr />" ""} \
        [regsub "</a>.*" $s ""]
}

# YahooSearch term ?page? --
#   Downloads result page $page (1-based, default 1) for $term and returns a
#   flat list of title/content/url triples.
proc YahooSearch {term {page 1}} {
    # Build the (ugly) scraper pattern. Each piece is its own command: when
    # they are joined on one line, the words "append" and "re" themselves get
    # appended to the pattern.
    # NOTE(review): the "abstr" and "url" anchors were stripped from the
    # published listing and are reconstructed from the class names the
    # parser-based example on this page tests for; confirm them.
    set re {<a class="yschttl spt" href=".+?" >(.+?)</a>}
    append re {.*?<div class="abstr[^>]*>(.+?)</div>}
    append re {.*?<span class="?url[^>]*>(.+?)</span>}

    # Perform the query; note that this handles special characters
    # in the query term correctly. "b" is the 1-based index of the
    # first result: 1, 11, 21, ...
    set q [http::formatQuery p $term b [expr {$page*10-9}]]
    set token [http::geturl http://search.yahoo.com/search?$q]
    set data [http::data $token]
    http::cleanup $token

    # Assemble the results into a nice list
    set results {}
    foreach {- title content url} [regexp -all -inline $re $data] {
        lappend results [fix $title] [fix $content] [fix $url]
    }
    return $results
}

# Usage:

foreach {title content url} [YahooSearch "test"] {
    puts $title
}</lang>

Tcl 8.6 can use its coroutine support to produce an iterator more like that Python code:

<lang tcl>proc yahoo! term {
    # Creates a uniquely named coroutine and returns its name. Each call of
    # that name yields the next hit as a dict with the keys title, content
    # and url, fetching further pages on demand.
    coroutine yahoo![incr ::yahoo] apply {term {
        # First yield hands the coroutine's own name back to the creator.
        yield [info coroutine]
        while {1} {
            set batch [YahooSearch $term [incr pageNo]]
            if {![llength $batch]} {
                # No more hits: finish with a "break" result code, which
                # ends the loop of whoever is calling the iterator.
                return -code break
            }
            foreach {hitTitle hitContent hitUrl} $batch {
                yield [dict create title $hitTitle content $hitContent url $hitUrl]
            }
        }
    }} $term
}

# test...

# Create the iterator, then print titles until it runs dry. The two commands
# must be on separate lines; joined, "set" receives too many arguments.
set it [yahoo! "test"]
while 1 {
    puts [dict get [$it] title]
    after 300  ;# Slow the code down... :-)
}</lang>

Another approach: this one uses a class, as specified in the task, and an HTML parser (parsing HTML with regular expressions is a particular annoyance of mine).

Works with: Tcl version 8.6

<lang tcl>package require Tcl 8.6 package require http package require htmlparse package require textutil::adjust

# yahoosearch --
#   Paged Yahoo! search that hands out one result at a time. Each result is
#   a dict with the keys url, title and abstract. Call "search" first, then
#   "nextresult" repeatedly; further pages are fetched on demand.
oo::class create yahoosearch {

    # Starts a new search for $s and forgets anything left over from a
    # previous search. "page" holds the 1-based index of the first result
    # to request (the "b" query parameter).
    method search {s} {
        my variable searchterm page baseurl results
        set searchterm $s
        set page 1
        set baseurl {http://search.yahoo.com/search}
        # Without this, nextresult would keep serving the old search's hits.
        unset -nocomplain results
    }

    # Downloads the current page, parses it and stores the list of result
    # dicts in "results". Returns that list.
    method getresults {} {
        my variable state results current_data
        set results [list]
        set current_data [dict create]
        set state looking_for_results
        htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
        # Flush the last record, but only if one was actually started;
        # otherwise a page without hits would yield one empty result.
        if {[dict size $current_data] > 0} {
            lappend results $current_data
        }
        return $results
    }

    # Moves on by ten results and loads that page.
    method nextpage {} {
        my variable page
        incr page 10
        my getresults
    }

    # Returns the next result dict, loading the first or the following page
    # when needed. Returns an empty value when that page has no results.
    method nextresult {} {
        my variable results page
        if { ! [info exists results]} {
            my getresults
        } elseif {[llength $results] == 0} {
            my nextpage
        }
        set results [lassign $results result]
        return $result
    }

    # Fetches the HTML of the current page. The http token is released even
    # when reading the response fails.
    method gethtml {} {
        my variable searchterm page baseurl
        set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
        set response [http::geturl $url]
        try {
            return [http::data $response]
        } finally {
            http::cleanup $response
        }
    }

    # Callback for htmlparse::parse, invoked once per tag. A small state
    # machine walks: main div -> result div -> title link -> abstract div.
    method html_parser_callback {tag slash param textBehindTheTag} {
        my variable state results current_data
        switch -exact -- $state {
            looking_for_results {
                if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                    set state ready
                }
            }
            ready {
                if {$tag eq "div" && [string first {class="res} $param] != -1} {
                    # A new result starts: store the previous one, if any.
                    if {[dict size $current_data] > 0} {lappend results $current_data}
                    set current_data [dict create]
                    set state getting_url
                }
            }
            getting_url {
                if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                    if {[regexp {href="(.+?)"} $param - url]} {
                        dict set current_data url $url
                    } else {
                        dict set current_data url "no href in tag params: '$param'"
                    }
                    dict set current_data title $textBehindTheTag
                    # Default, so every result has all three keys even when
                    # the page shows no abstract for it.
                    dict set current_data abstract ""
                    set state getting_title
                }
            }
            getting_title {
                if {$tag eq "a" && $slash eq "/"} {
                    set state looking_for_abstract
                } else {
                    # Text after inline tags inside the link is still title.
                    dict append current_data title $textBehindTheTag
                }
            }
            looking_for_abstract {
                if {$tag eq "span" && [string first {class="url} $param] != -1} {
                    # The url span came first: this result has no abstract.
                    set state ready
                } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                    dict set current_data abstract $textBehindTheTag
                    set state getting_abstract
                }
            }
            getting_abstract {
                if {$tag eq "div" && $slash eq "/"} {
                    set state ready
                } else {
                    dict append current_data abstract $textBehindTheTag
                }
            }
        }
    }

}

# Create the object, then start the search. These are two commands; joined
# on one line, "search" is never called.
yahoosearch create searcher
searcher search "search text here"

# Print the first 15 results, stopping early when there are no more.
for {set x 1} {$x <= 15} {incr x} {
    set result [searcher nextresult]
    if {[dict size $result] == 0} {
        break
    }
    puts [dict get $result title]
    puts [dict get $result url]
    # A result may come without an abstract.
    if {[dict exists $result abstract]} {
        puts [textutil::adjust::indent [textutil::adjust::adjust [dict get $result abstract]] "  "]
    }
    puts ""
}</lang>