Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.
Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.
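The interface being asked for might be outlined as follows. This is a sketch only, with hypothetical Python names; each solution below uses its own naming.
<lang python># Outline only -- hypothetical names; each solution below picks its own.
class YahooSearch:
    def __init__(self, query, page=1):
        """Fetch the given page of results for the query."""
    @property
    def results(self):
        """The results on the current page; each has url, title, content."""
    def next_page(self):
        """Return a YahooSearch positioned on the following page."""</lang>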
AutoHotkey
Translated from the Python example.
<lang AutoHotkey>test:
yahooSearch("test", 1)
yahooSearch("test", 2)
return

yahooSearch(query, page)
{
  global
  start := ((page - 1) * 10) + 1
  filedelete, search.txt
  urldownloadtofile, % "http://search.yahoo.com/search?p=" . query . "&b=" . start, search.txt
  fileread, content, search.txt
  ; The HTML tags in this pattern were lost in transcription; they are
  ; reconstructed here to match the markup scraped by the other solutions.
  reg = <a class="yschttl spt" href=".+?" >(.+?)</a></h3></div><div class="abstr">(.+?)</div><span class=url>(.+?)</span>
  index := found := 1
  while (found := regexmatch(content, reg, self, found + 1))
  {
    msgbox % title%A_Index% := fix(self1)
    content%A_Index% := fix(self2)
    url%A_Index% := fix(self3)
  }
}

fix(url)
{
  if pos := instr(url, "</a>")
    StringLeft, url, url, pos - 1
  url := regexreplace(url, "<.*?>")
  return url
}</lang>
C#
<lang csharp>using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

class YahooSearch {
    private string query;
    private string content;
    private int page = 1;

    public YahooSearch(string query) {
        this.query = query;
        this.content = new WebClient().DownloadString(
            "http://search.yahoo.com/search?p=" + query);
    }

    public YahooSearch(string query, int page) {
        this.query = query;
        this.page = page;
        this.content = new WebClient().DownloadString(
            String.Format("http://search.yahoo.com/search?p={0}&b={1}",
                query, ((this.page - 1) * 10) + 1));
    }

    string Fix(string x) {
        x = x.Replace("<b>", "").Replace("</b>", "").Replace("<wbr />", "")
             .Replace("<wbr>", "").Replace("<b>...</b>", "");

        int i = x.IndexOf("</a>");
        if (i > 0) return x.Substring(0, i);
        else return x;
    }

    public YahooResult[] Results {
        get {
            ArrayList results = new ArrayList();

            // The HTML tags in this pattern were lost in transcription;
            // the pattern is reconstructed here to match the other solutions.
            Regex pattern = new Regex(
                "<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a></h3></div>" +
                "<div class=\"abstr\">(.+?)</div><span class=url>(.+?)</span>");

            foreach (Match e in pattern.Matches(this.content)) {
                string rurl = Fix(e.Groups[3].Value);
                string rtitle = Fix(e.Groups[1].Value);
                string rcontent = Fix(e.Groups[2].Value);

                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }
            return (YahooResult[])results.ToArray(typeof(YahooResult));
        }
    }

    public YahooSearch NextPage() {
        return new YahooSearch(this.query, this.page + 1);
    }

    public YahooSearch GetPage(int page) {
        return new YahooSearch(this.query, page);
    }
}

class YahooResult {
    public string URL { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }

    public YahooResult(string url, string title, string content) {
        this.URL = url;
        this.Title = title;
        this.Content = content;
    }
}

// Usage:

class Prog {
    static void Main() {
        YahooSearch x = new YahooSearch("test");

        foreach (YahooResult result in x.Results) {
            Console.WriteLine(result.Title);
        }
    }
}</lang>
Perl
<lang perl>package YahooSearch;
use Encode;
use HTTP::Cookies;
use WWW::Mechanize;

# --- Internals -------------------------------------------------
sub apply (&$)
{my $f = shift; local $_ = shift; $f->(); return $_;}
# We construct a cookie to get 100 results per page and prevent
# "enhanced results".
my $search_prefs = 'v=1&n=100&sm=' .
    apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge}
    join '|',
    map {'!' . $_}
    qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);
my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

my $mech = new WWW::Mechanize
   (cookie_jar => $cookies, stack_depth => 0);
sub read_page
 {my ($next, $page, @results) =
     ($mech->find_link(text => 'Next >')->url,
      decode 'iso-8859-1', $mech->content);
  # The <div class="abstr"> markup in this pattern was lost in
  # transcription; it is reconstructed here from the other solutions.
  while ($page =~ m
      {<a \s class="yschttl \s spt" \s
       href=" ([^"]+) " \s* > #"
       (.+?) </a>
       .+?
       <div \s class="abstr"> (.+?) </div>}xg)
     {push @results, {url => $1, title => $2, content => $3};
      foreach ( @{$results[-1]}{qw(title content)} )
         {s/<.+?>//g;
          $_ = encode 'utf8', $_;}}
  return $next, \@results;}
# --- Methods ---------------------------------------------------
sub new
 {my $invocant = shift;
  my $class = ref($invocant) || $invocant;
  $mech->get('http://search.yahoo.com/search?p=' . apply
     {s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
      s/ /+/g;}
     shift);
  my ($next, $results) = read_page();
  return bless {link_to_next => $next, results => $results}, $class;}

sub results
 {@{shift()->{results}};}

sub next_page
 {my $invocant = shift;
  my $next = $invocant->{link_to_next};
  unless ($next)
     {$invocant->{results} = [];
      return undef;}
  $mech->get($next);
  ($next, my $results) = read_page();
  $invocant->{link_to_next} = $next;
  $invocant->{results} = $results;
  return 1;}</lang>
Python
<lang python>import urllib
import re

def fix(x):
    # The HTML tags in these literals were lost in transcription; they are
    # reconstructed here to match the other solutions.
    x = x.replace("<b>","").replace("</b>","").replace("<wbr />","").replace("<wbr>","").replace("<b>...</b>","")
    return x[:x.find("</a>")]

class YahooSearch:
    def __init__(self, query, page=1):
        self.query = query
        self.page = page
        self.url = "http://search.yahoo.com/search?p=%s&b=%s" % (self.query, ((self.page - 1) * 10 + 1))
        self.content = urllib.urlopen(self.url).read()

    def getresults(self):
        self.results = []

        # The HTML tags in this pattern were lost in transcription; the
        # pattern is reconstructed here to match the other solutions.
        for i in re.findall("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a></h3></div>" +
                            "<div class=\"abstr\">(.+?)</div><span class=url>(.+?)</span>",
                            self.content):
            title = fix(i[0])
            content = fix(i[1])
            url = fix(i[2])
            self.results.append(YahooResult(title, content, url))

        return self.results

    def getnextpage(self):
        return YahooSearch(self.query, self.page + 1)

    results = property(fget=getresults)
    nextpage = property(fget=getnextpage)

class YahooResult:
    def __init__(self, title, content, url):
        self.title = title
        self.content = content
        self.url = url

# Usage:
x = YahooSearch("test")

for result in x.results:
    print result.title</lang>
R
Rather than using regexes to find the content (like some of the other solutions here), this method parses the HTML and finds the appropriate sections.
<lang R>YahooSearch <- function(query, page=1, .opts=list(), ignoreMarkUpErrors=TRUE)
{
   if(!require(RCurl) || !require(XML))
   {
      stop("Could not load required packages")
   }

   # Replace " " with "%20", etc
   query <- curlEscape(query)

   # Retrieve page
   b <- 10*(page-1)+1
   theurl <- paste("http://uk.search.yahoo.com/search?p=",
      query, "&b=", b, sep="")
   webpage <- getURL(theurl, .opts=.opts)

   # Save search for nextpage function
   .Search <- list(query=query, page=page, .opts=.opts,
      ignoreMarkUpErrors=ignoreMarkUpErrors)
   assign(".Search", .Search, envir=globalenv())

   # Parse HTML; retrieve results block
   webpage <- readLines(tc <- textConnection(webpage)); close(tc)
   if(ignoreMarkUpErrors)
   {
      pagetree <- htmlTreeParse(webpage, error=function(...){})
   } else
   {
      pagetree <- htmlTreeParse(webpage)
   }

   findbyattr <- function(x, id, type="id")
   {
      ids <- sapply(x, function(x) x$attributes[type])
      x[ids==id]
   }

   body <- pagetree$children$html$children$body
   bd <- findbyattr(body$children$div$children, "bd")
   left <- findbyattr(bd$div$children$div$children, "left")
   web <- findbyattr(left$div$children$div$children, "web")
   resol <- web$div$children$ol

   # Get url, title, content from results
   gettextfromnode <- function(x)
   {
      un <- unlist(x$children)
      paste(un[grep("value", names(un))], collapse=" ")
   }

   n <- length(resol)
   results <- list()
   length(results) <- n
   for(i in 1:n)
   {
      mainlink <- resol[[i]]$children$div$children[1]$div$children$h3$children$a
      url <- mainlink$attributes["href"]
      title <- gettextfromnode(mainlink)

      contenttext <- findbyattr(resol[[i]]$children$div$children[2], "abstr", type="class")
      if(length(contenttext)==0)
      {
         contenttext <- findbyattr(resol[[i]]$children$div$children[2]$div$children$div$children,
            "sm-abs", type="class")
      }

      content <- gettextfromnode(contenttext$div)
      results[[i]] <- list(url=url, title=title, content=content)
   }
   names(results) <- as.character(seq(b, b+n-1))
   results
}

nextpage <- function()
{
   if(exists(".Search", envir=globalenv()))
   {
      .Search <- get(".Search", envir=globalenv())
      .Search$page <- .Search$page + 1L
      do.call(YahooSearch, .Search)
   } else
   {
      message("No search has been performed yet")
   }
}

# Usage
YahooSearch("rosetta code")
nextpage()</lang>
Ruby
Uses Hpricot to parse the HTML. Someone more skillful than I at XPath or CSS could tighten up the parse_html method (one possible tightening is sketched after the code).
<lang ruby>require 'open-uri'
require 'rubygems'
require 'hpricot'

SearchResult = Struct.new(:url, :title, :content)

class SearchYahoo
  @@urlinfo = [nil, 'ca.search.yahoo.com', 80, '/search', nil, nil]

  def initialize(term)
    @term = term
    @page = 1
    @results = nil
    @url = URI::HTTP.build(@@urlinfo)
  end

  def next_result
    if not @results
      @results = []
      fetch_results
    elsif @results.empty?
      next_page
    end
    @results.shift
  end

  def fetch_results
    @url.query = URI.escape("p=%s&b=%d" % [@term, @page])
    doc = open(@url) { |f| Hpricot(f) }
    parse_html(doc)
  end

  def next_page
    @page += 10
    fetch_results
  end

  def parse_html(doc)
    doc.search("div#main").search("div").each do |div|
      next unless div.has_attribute?("class") and div.get_attribute("class").index("res") == 0
      result = SearchResult.new
      div.search("a").each do |link|
        next unless link.has_attribute?("class") and link.get_attribute("class") == "yschttl spt"
        result.url = link.get_attribute("href")
        result.title = link.inner_text
      end
      div.search("div").each do |abstract|
        next unless abstract.has_attribute?("class") and abstract.get_attribute("class").index("abstr")
        result.content = abstract.inner_text
      end
      @results << result
    end
  end
end

s = SearchYahoo.new("test")
15.times do |i|
  result = s.next_result
  puts i+1
  puts result.title
  puts result.url
  puts result.content
  puts
end</lang>
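As for the tightening suggested above, parse_html could lean on Hpricot's CSS selectors instead of walking every div by hand. The following is an untested sketch, assuming the result markup carries the same "res", "yschttl" and "abstr" classes the version above looks for:
<lang ruby>def parse_html(doc)
  # "div.res" selects only the result blocks; "at" returns the first match.
  doc.search("div#main div.res").each do |div|
    result = SearchResult.new
    if link = div.at("a.yschttl")
      result.url = link.get_attribute("href")
      result.title = link.inner_text
    end
    if abstract = div.at("div.abstr")
      result.content = abstract.inner_text
    end
    @results << result
  end
end</lang>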
Tcl
<lang tcl>package require http

proc fix s {
    # The HTML tags in this mapping were lost in transcription; they are
    # reconstructed here to match the tags stripped by the other solutions.
    string map {<b>...</b> "" <b> "" </b> "" "<wbr />" "" <wbr> ""} \
        [regsub "</a>.*" $s ""]
}
proc YahooSearch {term {page 1}} {
    # Build the (ugly) scraper regular expression; the literal tags were
    # lost in transcription and are reconstructed from the other solutions.
    append re {<a class="yschttl spt" href=".+?" >(.+?)</a></h3></div>}
    append re {<div class="abstr">(.+?)</div><span class=url>(.+?)</span>}

    # Perform the query; note that this handles special characters
    # in the query term correctly
    set q [http::formatQuery p $term b [expr {$page*10-9}]]
    set token [http::geturl http://search.yahoo.com/search?$q]
    set data [http::data $token]
    http::cleanup $token

    # Assemble the results into a nice list
    set results {}
    foreach {- title content url} [regexp -all -inline $re $data] {
        lappend results [fix $title] [fix $content] [fix $url]
    }

    # Set up the call for the next page
    interp alias {} Nextpage {} YahooSearch $term [incr page]

    return $results
}

# Usage: get the first two pages of results
foreach {title content url} [YahooSearch "test"] {
    puts $title
}
foreach {title content url} [Nextpage] {
    puts $title
}</lang>
With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like:
<lang tcl>package require Tcl 8.6

oo::class create WebSearcher {
    variable page term results
    constructor searchTerm {
        set page 0
        set term $searchTerm
        my nextPage
    }
    # This next method *is* a very Tcl-ish way of doing iteration.
    method for {titleVar contentsVar urlVar body} {
        upvar 1 $titleVar t $contentsVar c $urlVar v
        foreach {t c v} $results {
            uplevel 1 $body
        }
    }
    # Reuse the previous code for simplicity rather than writing it anew.
    # Of course, if we were serious about this, we'd put the code here properly.
    method nextPage {} {
        set results [YahooSearch $term [incr page]]
        return
    }
}

# How to use. Note the 'for' method use below; new "keywords" as methods!
set ytest [WebSearcher new "test"]
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest nextPage
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest destroy    ;# standard method that deletes the object</lang>
However, the paradigm of an iterator is also interesting, and it is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic...
<lang tcl>package require Tcl 8.6

proc yahoo! term {
    coroutine yahoo![incr ::yahoo] apply {term {
        yield [info coroutine]
        while 1 {
            set results [YahooSearch $term [incr step]]
            if {[llength $results] == 0} {
                return -code break
            }
            foreach {t c u} $results {
                yield [dict create title $t content $c url $u]
            }
        }
    }} $term
}
# Test by getting the first fifty titles...
set it [yahoo! "test"]
for {set i 50} {$i>0} {incr i -1} {
    puts [dict get [$it] title]
    after 300    ;# Slow the code down... :-)
}</lang>
Another approach: uses a class as specified in the task. Also, uses an HTML parser from Tcllib (parsing HTML with regular expressions is a particular annoyance of mine).
<lang tcl>package require Tcl 8.6
package require http
package require htmlparse
package require textutil::adjust

oo::class create yahoosearch {

    method search {s} {
        my variable searchterm page baseurl
        set searchterm $s
        set page 1
        set baseurl {http://ca.search.yahoo.com/search}
    }

    method getresults {} {
        my variable state results current_data
        set results [list]
        set current_data [dict create]
        set state looking_for_results
        htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
    }

    method nextpage {} {
        my variable page
        incr page 10
        my getresults
    }

    method nextresult {} {
        my variable results page
        if { ! [info exists results]} {
            my getresults
        } elseif {[llength $results] == 0} {
            my nextpage
        }
        set results [lassign $results result]
        return $result
    }

    method gethtml {} {
        my variable searchterm page baseurl
        set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
        set response [http::geturl $url]
        set html [http::data $response]
        http::cleanup $response
        return $html
    }

    method html_parser_callback {tag slash param textBehindTheTag} {
        my variable state results current_data
        switch -exact -- $state {
            looking_for_results {
                if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                    set state ready
                }
            }
            ready {
                if {($tag eq "div" && [string first {class="res} $param] != -1) ||
                    ($tag eq "html" && $slash eq "/")
                } { #" -- unbalanced quote disturbs syntax highlighting
                    if {[dict size $current_data] > 0} {lappend results $current_data}
                    set current_data [dict create]
                    set state getting_url
                }
            }
            getting_url {
                if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                    if {[regexp {href="(.+?)"} $param - url]} {
                        dict set current_data url $url
                    } else {
                        dict set current_data url "no href in tag params: '$param'"
                    }
                    dict set current_data title $textBehindTheTag
                    set state getting_title
                }
            }
            getting_title {
                if {$tag eq "a" && $slash eq "/"} {
                    set state looking_for_abstract
                } else {
                    dict append current_data title $textBehindTheTag
                }
            }
            looking_for_abstract {
                if {$tag eq "span" && [string first {class="url} $param] != -1} {
                    set state ready
                } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                    dict set current_data abstract $textBehindTheTag
                    set state getting_abstract
                }
            }
            getting_abstract {
                if {$tag eq "div" && $slash eq "/"} {
                    set state ready
                } else {
                    dict append current_data abstract $textBehindTheTag
                }
            }
        }
    }
}

yahoosearch create searcher
searcher search "search text here"

for {set x 1} {$x <= 15} {incr x} {
    set result [searcher nextresult]
    dict with result {
        puts $title
        puts $url
        puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] "  "]
        puts ""
    }
}</lang>