Yahoo! search interface: Difference between revisions

Content added Content deleted

Inline

Revision as of 19:50, 25 August 2009

Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.

AutoHotkey

Translated from the Python example.
<lang AutoHotkey>test:
yahooSearch("test", 1)
yahooSearch("test", 2)
return

; yahooSearch(query, page)
; Downloads one page of Yahoo results for `query` into search.txt, reads the
; file back and walks every match of `reg` in it. For the n-th match the
; cleaned captures are stored in the variables title<n>, content<n> and
; url<n>, and the title is shown in a message box.
;   query - search text, appended to the URL as-is
;   page  - page number; converted to the "b" result offset (1, 11, 21, ...)
yahooSearch(query, page) {

 ; Assume-global mode: every variable assigned below is a global.
 global
 start := ((page - 1) * 10) + 1
 filedelete, search.txt
 urldownloadtofile, % "http://search.yahoo.com/search?p=" . query
 . "&b=" . start, search.txt
 fileread, content, search.txt

; NOTE(review): the pattern below looks damaged - the loop reads three
; captures (self1..self3) but the markup between the groups appears to be
; missing. Confirm against the original before relying on it.
reg = <a class="yschttl spt" href=".+?" >(.+?)</a>

(.+?)

 ; `index` is assigned here but not read again in this function.
 index := found := 1
 ; Each pass resumes the search one character after the previous match start.
 while (found := regexmatch(content, reg, self, found + 1))
 {
   msgbox % title%A_Index% := fix(self1)
   content%A_Index% := fix(self2)
   url%A_Index% := fix(self3)
 }

}

; fix(url)
; Cleans one captured fragment: keeps only the text before the first "</a>"
; (when there is one) and then removes every remaining "<...>" tag.
; Returns the cleaned string.
fix(url) {
    ; InStr yields 0 when "</a>" is absent, so the cut only happens on a hit.
    if pos := instr(url, "</a>")
        StringLeft, url, url, pos - 1
    ; With no replacement argument RegExReplace deletes each match.
    url := regexreplace(url, "<.*?>")
    return url
}

</lang>

Perl

<lang perl>package YahooSearch;

use Encode; use HTTP::Cookies; use WWW::Mechanize;

# --- Internals -------------------------------------------------

# apply BLOCK VALUE
# Runs BLOCK with $_ set to a private copy of VALUE and returns whatever $_
# holds once BLOCK has finished. The caller's own $_ is restored afterwards.
# The (&$) prototype is what lets callers write: apply { s/x/y/ } $string
sub apply (&$)
{
    my ($code, $value) = @_;

    # Localise the global $_ so the block can edit it with bare s/// calls.
    local $_ = $value;
    $code->();

    return $_;
}

# We construct a cookie to get 100 results per page and prevent
# "enhanced results".

# Value for the 'sB' cookie: the fixed 'v=1&n=100&sm=' prefix followed by the
# '!'-prefixed codes below joined with '|', with every non-alphanumeric
# character of that joined string percent-encoded.
my $search_prefs = 'v=1&n=100&sm=' .
    apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge}
    join '|',
    map {'!' . $_}
    qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

# One user agent shared by every object of this package. It is created with
# an explicit Class->new call rather than the ambiguous indirect-object form
# "new Class (...)".
my $mech = WWW::Mechanize->new(
    cookie_jar  => $cookies,
    stack_depth => 0,
);

# read_page()
# Scrapes the page currently loaded in the shared $mech agent.
# Returns a two-element list:
#   1. the URL of the page's 'Next >' link, or undef when there is none;
#   2. a reference to an array of hashes with the keys url, title and
#      content. Title and content have their tags removed and are UTF-8
#      encoded.
sub read_page
{
    # find_link returns undef when no link matches. Calling ->url on that
    # would die on the last page, while next_page expects a false value.
    my $next_link = $mech->find_link(text => 'Next >');
    my $next      = $next_link ? $next_link->url : undef;

    my $page = decode 'iso-8859-1', $mech->content;
    my @results;

    # Under /x a '#' starts a comment that runs to the end of the line, so
    # the quote-balancing '#"' marker must be the last thing on its line or
    # it swallows the title capture.
    # NOTE(review): the abstract delimiters were missing from this copy of
    # the pattern. '<div class="abstr...' follows the class prefix the Ruby
    # and Tcl solutions on this page look for - confirm against real markup.
    while ($page =~ m{
            <a \s class="yschttl \s spt" \s href=" ([^"]+) " \s* >   #"
            (.+?) </a>
            .+?
            <div \s class="abstr [^>]* >
            (.+?)
            </div>
        }xg)
    {
        my %result = (url => $1, title => $2, content => $3);
        for my $field (qw(title content)) {
            $result{$field} =~ s/<.+?>//g;
            $result{$field} = encode 'utf8', $result{$field};
        }
        push @results, \%result;
    }

    return $next, \@results;
}

# --- Methods ---------------------------------------------------

# new(QUERY)
# Constructor, callable on the class or on an existing object. Encodes
# QUERY, loads the first results page through the shared $mech agent and
# returns an object holding that page's results and the next-page link.
sub new
{
    my ($invocant, $query) = @_;
    my $class = ref($invocant) || $invocant;

    # Everything except ASCII letters, digits and spaces becomes %XX;
    # the spaces are then turned into '+'.
    my $escaped = apply {
        s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
        s/ /+/g;
    } $query;

    $mech->get('http://search.yahoo.com/search?p=' . $escaped);

    my ($next, $results) = read_page();
    my %self = (link_to_next => $next, results => $results);
    return bless \%self, $class;
}

# results()
# Returns the result hashes of the page this object currently holds
# (in scalar context: how many there are).
sub results
{
    my ($self) = @_;
    my $page_results = $self->{results};
    return @$page_results;
}

# next_page()
# Advances the object to the following results page.
# Returns 1 after loading it. When the object holds no next-page link the
# stored results are emptied and undef is returned.
sub next_page
{
    my ($self) = @_;
    my $url = $self->{link_to_next};

    if (!$url) {
        $self->{results} = [];
        return undef;
    }

    $mech->get($url);
    my ($following, $hits) = read_page();
    @{$self}{qw(link_to_next results)} = ($following, $hits);
    return 1;
}</lang>

Python

This example is incorrect. It does not accomplish the given task. Please fix the code and remove this message.

<lang python>import urllib import re

def fix(x):
    """Return one scraped fragment as plain text.

    Everything from the first ``</a>`` onwards is discarded, any remaining
    ``<...>`` tags are stripped and literal ``...`` runs are removed.
    """
    # str.find returns -1 when "</a>" is absent, and slicing with -1 would
    # silently drop the last character, so only cut when the marker exists.
    end = x.find("</a>")
    if end != -1:
        x = x[:end]
    # The earlier chain of replace("", "") calls did nothing as written.
    # A generic tag strip is used instead, the same approach as the
    # AutoHotkey translation of this function on this page.
    x = re.sub(r"<.*?>", "", x)
    return x.replace("...", "")

class YahooSearch:
   """One page of Yahoo search results for a query.

   The page is downloaded as soon as the object is created. ``results``
   and ``nextpage`` are read-only properties backed by ``getresults`` and
   ``getnextpage``.
   """

   def __init__(self, query, page=1):       
       """Store ``query`` and ``page`` and fetch the matching results page.

       ``page`` is converted to the ``b`` offset (1, 11, 21, ...). The
       query is put into the URL as given, without any escaping.
       """
       self.query = query
       self.page = page
       self.url = "http://search.yahoo.com/search?p=%s&b=%s" %(self.query, ((self.page - 1) * 10 + 1))
       self.content = urllib.urlopen(self.url).read()        

   def getresults(self):
       """Rebuild and return ``self.results``, a list of YahooResult objects."""
       self.results = []

# NOTE(review): the pattern below is damaged - its string literals are
# unterminated and the markup between the three groups appears to be
# missing, so this statement does not parse as written. Restore it from the
# original before use.
for i in re.findall("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>"+\ "

(.+?)

(.+?)",self.content):

           # The three captures are cleaned with fix() before being stored.
           title = fix(i[0])
           content = fix(i[1])
           url = fix(i[2])

           self.results.append(YahooResult(title, content, url))

       return self.results

   def getnextpage(self):
       """Return a new YahooSearch for the same query, one page further on."""
       return YahooSearch(self.query, self.page+1)

   results = property(fget=getresults)
   nextpage = property(fget=getnextpage)

class YahooResult:
   """Plain record for one search hit: its title, content text and URL."""

   def __init__(self,title,content,url):
       # Keep the three values unchanged under same-named attributes.
       self.title, self.content, self.url = title, content, url

# Usage:

x = YahooSearch("test")

for result in x.results:

   print result.title</lang>

Rather than using regexes to find the content (like some of the other solutions here) this method parses the HTML and finds the appropriate sections. <lang R>
# YahooSearch: fetch one page of Yahoo results and return them as a list.
#   query              - search text (escaped here before it goes into the URL)
#   page               - page number; converted to the "b" offset 1, 11, 21, ...
#   .opts              - passed through to getURL
#   ignoreMarkUpErrors - when TRUE, HTML parse errors go to a do-nothing handler
# Returns a list with one list(url, title, content) per result, named by
# result number starting at the page's offset.
# Side effect: stores the call's arguments as `.Search` in the global
# environment so that nextpage() can repeat the search.
YahooSearch <- function(query, page=1, .opts=list(), ignoreMarkUpErrors=TRUE) {

   if(!require(RCurl) || !require(XML))
   {
      stop("Could not load required packages")
   }

   # Replace " " with "%20", etc. The caller's query is kept unescaped so
   # that the copy saved for nextpage() is not escaped a second time.
   escaped <- curlEscape(query)

   # Retrieve page
   b <- 10*(page-1)+1
   theurl <- paste("http://uk.search.yahoo.com/search?p=",
      escaped, "&b=", b, sep="")
   webpage <- getURL(theurl, .opts=.opts)

   # Save search for nextpage function
   .Search <- list(query=query, page=page, .opts=.opts,
      ignoreMarkUpErrors=ignoreMarkUpErrors)
   assign(".Search", .Search, envir=globalenv())

   # Parse HTML; retrieve results block
   webpage <- readLines(tc <- textConnection(webpage)); close(tc)
   if(ignoreMarkUpErrors)
   {
      pagetree <- htmlTreeParse(webpage, error=function(...){})
   } else
   {
      pagetree <- htmlTreeParse(webpage)
   }

   # Keep the elements of x whose attribute `type` equals `id`
   findbyattr <- function(x, id, type="id")
   {
      ids <- sapply(x, function(x) x$attributes[type])
      x[ids==id]
   }

   body <- pagetree$children$html$children$body
   bd <- findbyattr(body$children$div$children, "bd")
   left <- findbyattr(bd$div$children$div$children, "left")
   web <- findbyattr(left$div$children$div$children, "web")
   resol <- web$div$children$ol

   #Get url, title, content from results
   gettextfromnode <- function(x)
   {
      un <- unlist(x$children)
      paste(un[grep("value", names(un))], collapse=" ")
   }

   n <- length(resol)
   results <- list()
   length(results) <- n
   # seq_len(n) is empty when n is 0, whereas 1:n would iterate over 1 and 0
   for(i in seq_len(n))
   {
      # Each result is the i-th element of the results list node
      mainlink <- resol[[i]]$children$div$children[1]$div$children$h3$children$a
      url <- mainlink$attributes["href"]
      title <- gettextfromnode(mainlink)

      contenttext <- findbyattr(resol[[i]]$children$div$children[2], "abstr", type="class")
      if(length(contenttext)==0)
      {
          contenttext <- findbyattr(resol[[i]]$children$div$children[2]$div$children$div$children,
            "sm-abs", type="class")
      }

      content <- gettextfromnode(contenttext$div)
      results[[i]] <- list(url=url, title=title, content=content)
   }
   # length.out keeps the names vector the same length as results, also for n == 0
   names(results) <- as.character(seq(b, length.out=n))
   results

}

# nextpage: repeat the most recent YahooSearch() call for the following page.
# Reads the `.Search` list from the global environment, adds one to its page
# and calls YahooSearch with those arguments. When `.Search` does not exist
# it only emits a message.
nextpage <- function() {

   if(!exists(".Search", envir=globalenv()))
   {
      message("No search has been performed yet")
      return(invisible(NULL))
   }

   last <- get(".Search", envir=globalenv())
   last$page <- last$page + 1L
   do.call(YahooSearch, last)

}

# Usage
YahooSearch("rosetta code")
nextpage()
</lang>

Ruby

Uses

Library: RubyGems

Library: Hpricot

to parse the HTML. Someone more skillful than I at XPath or CSS could tighten up the parse_html method.

<lang ruby>require 'open-uri' require 'hpricot'

SearchResult = Struct.new(:url, :title, :content)

# Hands out Yahoo web-search results for one term, one at a time, fetching
# further result pages on demand.
class SearchYahoo

  # Component array given to URI::HTTP.build to form the search URL.
  @@urlinfo = [nil, 'ca.search.yahoo.com', 80, '/search', nil, nil]

  def initialize(term)
    @term, @page, @results = term, 1, nil
    @url = URI::HTTP.build(@@urlinfo)
  end

  # Returns the next queued SearchResult. The first call fetches the first
  # page; an empty queue triggers a fetch of the following page.
  def next_result
    if @results
      next_page if @results.empty?
    else
      @results = []
      fetch_results
    end
    @results.shift
  end

  # Downloads the page at the current offset and queues its results.
  def fetch_results
    params = format("p=%s&b=%d", @term, @page)
    @url.query = URI.escape(params)
    doc = open(@url) { |io| Hpricot(io) }
    parse_html(doc)
  end

  # Moves the offset on by ten results and fetches that page.
  def next_page
    @page += 10
    fetch_results
  end

  # Appends one SearchResult to the queue for every div inside div#main
  # whose class attribute starts with "res".
  def parse_html(doc)
    doc.search("div#main").search("div").each do |block|
      block_class = block.get_attribute("class") if block.has_attribute?("class")
      next unless block_class && block_class.index("res") == 0

      hit = SearchResult.new
      block.search("a").each do |anchor|
        if anchor.has_attribute?("class") && anchor.get_attribute("class") == "yschttl spt"
          hit.url = anchor.get_attribute("href")
          hit.title = anchor.inner_text
        end
      end
      block.search("div").each do |inner|
        if inner.has_attribute?("class") && inner.get_attribute("class").index("abstr")
          hit.content = inner.inner_text
        end
      end
      @results << hit
    end
  end

end

s = SearchYahoo.new("test") 15.times do |i|

 result = s.next_result
 puts i+1
 puts result.title
 puts result.url
 puts result.content
 puts

end</lang>

Tcl

Translation of: Python

<lang tcl>package require http

# fix s
# Cleans one captured fragment: drops everything from the first "</a>"
# onwards and runs the remainder through a string map.
# NOTE(review): the map below looks damaged - most of its keys are empty
# strings, so the tags they once named appear to be missing. Confirm against
# the original.
proc fix s {

   string map {... ""  ""  ""  "" "" ""} \

[regsub "</a>.*" $s ""]

} proc YahooSearch {term {page 1}} {

   # YahooSearch term ?page?
   # Fetches one page of results for term and returns a flat list of
   # title, content, url triples. It also defines the command Nextpage,
   # which repeats the search for the following page.

   # Build the (ugly) scraper regular expression
   # NOTE(review): the pieces appended below look damaged - the markup
   # between the three capture groups appears to be missing. Confirm against
   # the original.

append re {<a class="yschttl spt" href=".+?" >(.+?)</a>} append re {

(.+?)} append re {

(.+?)}

   # Perform the query; note that this handles special characters
   # in the query term correctly
   # The "b" offset is 1 for page 1, 11 for page 2, and so on.
   set q [http::formatQuery p $term b [expr {$page*10-9}]]
   set token [http::geturl http://search.yahoo.com/search?$q]
   set data [http::data $token]
   http::cleanup $token

   # Assemble the results into a nice list
   # Each match contributes four items: the whole match, which is ignored,
   # and the three captures.
   set results {}
   foreach {- title content url} [regexp -all -inline $re $data] {
       lappend results [fix $title] [fix $content] [fix $url]
   }

   # set up the call for the next page
   interp alias {} Nextpage {} YahooSearch $term [incr page]

   return $results

}

Usage: get the first two pages of results

foreach {title content url} [YahooSearch "test"] {

   puts $title

} foreach {title content url} [Nextpage] {

    puts $title

}</lang>

Works with: Tcl version 8.6

With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like: <lang tcl>package require Tcl 8.6

# WebSearcher: object wrapper around the YahooSearch command.
# An instance holds the search term, the current page number and that
# page's results as a flat list of title, content, url triples.
oo::class create WebSearcher {

   variable page term results

   # Remember the term and load the first page straight away.
   constructor searchTerm {
       set term $searchTerm
       set page 0
       my nextPage
   }

   # Iteration method: runs body in the caller's scope once per result, with
   # the three named caller variables set to that result's fields.
   method for {titleVar contentsVar urlVar body} {
       upvar 1 $titleVar titleRef $contentsVar contentsRef $urlVar urlRef
       foreach {titleRef contentsRef urlRef} $results {
           uplevel 1 $body
       }
   }

   # Step to the next page and replace the stored results with it.
   # Delegates to YahooSearch instead of repeating the scraper here.
   method nextPage {} {
       incr page
       set results [YahooSearch $term $page]
       return
   }

}

How to use. Note the 'foreach' method use below; new "keywords" as methods!

set ytest [WebSearcher new "test"] $ytest for title - url {

   puts "\"$title\" : $url"

} $ytest nextPage $ytest for title - url {

   puts "\"$title\" : $url"

} $ytest delete ;# standard method that deletes the object</lang> However, the paradigm of an iterator is also interesting and is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic... <lang tcl>package require Tcl 8.6

# yahoo! term
# Creates a coroutine that hands out the search results for term one at a
# time and returns the coroutine's command name. Each later call of that
# command returns a dict with the keys title, content and url.
proc yahoo! term {

   # The coroutine name is made unique by counting up the global ::yahoo.
   coroutine yahoo![incr ::yahoo] apply {term {
       # First yield: pass the coroutine's own name back to the creator.
       yield [info coroutine]
       while 1 {
           # step is unset on the first pass, so incr starts it at 1.
           set results [YahooSearch $term [incr step]]
           if {[llength $results] == 0} {
               # No results on this page: finish the coroutine with a break
               # return code.
               return -code break
           }
           foreach {t c u} $results {
               yield [dict create title $t content $c url $u]
           }
       }
   }} $term

}

test by getting first fifty titles...

set it [yahoo! "test"] for {set i 50} {$i>0} {incr i -1} {

   puts [dict get [$it] title]
   after 300  ;# Slow the code down... :-)

}</lang>

Another approach: uses a class as specified in the task. Also, uses an html parser from

Library: tcllib

(parsing html with regular expressions is a particular annoyance of mine).

Works with: Tcl version 8.6

<lang tcl>package require Tcl 8.6 package require http package require htmlparse package require textutil::adjust

# yahoosearch: searcher class that walks the result page with
# htmlparse::parse and a small state machine instead of regular expressions.
# Call "search" first, then "nextresult" repeatedly; each result is a dict
# with the keys url and title, plus abstract when one was found.
oo::class create yahoosearch {

   # Start a new search: remember the term and reset the offset to 1.
   method search {s} {
       my variable searchterm page baseurl
       set searchterm $s
       set page 1
       set baseurl {http://ca.search.yahoo.com/search}
   }

   # Fetch the page at the current offset and refill the results list by
   # feeding its HTML through html_parser_callback.
   method getresults {} {
       my variable state results current_data
       set results [list]
       set current_data [dict create]
       set state looking_for_results
       htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
   }

   # Move the offset on by ten and load that page.
   method nextpage {} {
       my variable page
       incr page 10
       my getresults
   }
   
   # Return the next result dict, loading the first page on first use and
   # the following page when the list has run empty.
   method nextresult {} {
       my variable results page
       if { ! [info exists results]} {
           my getresults
       } elseif {[llength $results] == 0} {
           my nextpage
       }
       # lassign stores the first element in result and returns the rest.
       set results [lassign $results result]
       return $result
   }

   # Download and return the HTML of the current page; the page variable is
   # sent as the "b" query parameter.
   method gethtml {} {
       my variable searchterm page baseurl
       set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
       set response [http::geturl $url]
       set html [http::data $response]
       http::cleanup $response
       return $html
   }

   # Callback run by htmlparse::parse for every tag. The state variable
   # decides what is looked for next:
   #   looking_for_results -> ready               on the div with id="main"
   #   ready               -> getting_url         on a result div or closing html
   #   getting_url         -> getting_title       on the result's title link
   #   getting_title       -> looking_for_abstract on the closing a tag
   #   looking_for_abstract -> getting_abstract or back to ready
   #   getting_abstract    -> ready               on the closing div
   method html_parser_callback {tag slash param textBehindTheTag} {
       my variable state results current_data
       switch -exact -- $state {
           looking_for_results {
               if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                   set state ready
               }
           }
           ready {
               if {($tag eq "div" && [string first {class="res} $param] != -1) ||
                   ($tag eq "html" && $slash eq "/")
               } { #" -- unbalanced quote disturbs syntax highlighting
                   # A new result, or the end of the document, begins here:
                   # store the result collected so far, if any.
                   if {[dict size $current_data] > 0} {lappend results $current_data}
                   set current_data [dict create]
                   set state getting_url
               }
           }
           getting_url {
               if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                   if {[regexp {href="(.+?)"} $param - url]} {
                       dict set current_data url $url
                   } else {
                       dict set current_data url "no href in tag params: '$param'"
                   }
                   dict set current_data title $textBehindTheTag
                   set state getting_title
               }
           }
           getting_title {
               if {$tag eq "a" && $slash eq "/"} {
                   set state looking_for_abstract
               } else {
                   # Text after tags nested inside the link is still title.
                   dict append current_data title $textBehindTheTag
               }
           }
           looking_for_abstract {
               if {$tag eq "span" && [string first {class="url} $param] != -1} {
                   # The url span came first: this result gets no abstract.
                   set state ready
               } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                   dict set current_data abstract $textBehindTheTag
                   set state getting_abstract
               }
           }
           getting_abstract {
               if {$tag eq "div" && $slash eq "/"} {
                   set state ready
               } else {
                   dict append current_data abstract $textBehindTheTag
               }
           }
       }
   }

}

yahoosearch create searcher searcher search "search text here"

for {set x 1} {$x <= 15} {incr x} {

   set result [searcher nextresult]
   dict with result {
       puts $title
       puts $url
       puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] "  "]
       puts ""
   }

}</lang>

@@ Line 343: / Line 343: @@
 Uses {{libheader|RubyGems}} {{libheader|Hpricot}} to parse the HTML.  Someone more skillful than I at XPath or CSS could tighten up the <code>parse_html</code> method.
 <lang ruby>require 'open-uri'
-require 'rubygems'
 require 'hpricot'

Revision as of 19:50, 25 August 2009

AutoHotkey

C#

Perl

Python

R

Ruby

Tcl