Yahoo! search interface
You are encouraged to solve this task according to the task description, using any language you may know.
Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.
C#
<lang csharp>using System; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.Linq;
/// <summary>
/// One page of Yahoo! web-search results for a query.
/// The page is downloaded in the constructor; <see cref="Results"/> scrapes it.
/// </summary>
class YahooSearch {
    private const string SearchUrl = "http://search.yahoo.com/search";
    private const int ResultsPerPage = 10;

    // NOTE(review): the literal in the published source was damaged (its HTML
    // tags were stripped). This is a reconstruction of the result markup the
    // scraper targeted; confirm it against a real response before relying on it.
    private static readonly Regex ResultPattern = new Regex(
        "<a class=\"yschttl spt\" href=\".+?\" >(?<title>.+?)</a></h3></div>" +
        ".*?<div class=\"abstr\">(?<content>.+?)</div>" +
        ".*?<span class=url>(?<url>.+?)</span>");

    private string query;
    private string content;
    private int page = 1;

    /// <summary>Fetches the first page of results for <paramref name="query"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public YahooSearch(string query) {
        if (query == null) throw new ArgumentNullException("query");
        this.query = query;
        // Escape the query so spaces, '&', '#', ... cannot corrupt the URL.
        this.content = Download(SearchUrl + "?p=" + Uri.EscapeDataString(query));
    }

    /// <summary>Fetches the given 1-based <paramref name="page"/> of results.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
    public YahooSearch(string query, int page) {
        if (query == null) throw new ArgumentNullException("query");
        this.query = query;
        this.page = page;
        // "b" is the 1-based index of the first result on the requested page.
        int firstResult = ((this.page - 1) * ResultsPerPage) + 1;
        this.content = Download(String.Format("{0}?p={1}&b={2}",
            SearchUrl, Uri.EscapeDataString(query), firstResult));
    }

    // Downloads a URL as text and releases the WebClient afterwards.
    private static string Download(string url) {
        using (WebClient client = new WebClient()) {
            return client.DownloadString(url);
        }
    }

    // Removes highlight / word-break markup and cuts everything from "</a>" on.
    private static string Fix(string x) {
        // The bold ellipsis must go first: once "<b>" is removed it can no
        // longer match. Replacing an empty string (as the damaged source did)
        // throws ArgumentException.
        x = x.Replace("<b>...</b>", "")
             .Replace("<b>", "")
             .Replace("</b>", "")
             .Replace("<wbr />", "")
             .Replace("<wbr>", "");
        int i = x.IndexOf("</a>", StringComparison.Ordinal);
        return i > 0 ? x.Substring(0, i) : x;
    }

    /// <summary>URL, title and abstract of every result found on this page.</summary>
    public YahooResult[] Results {
        get {
            List<YahooResult> results = new List<YahooResult>();
            foreach (Match m in ResultPattern.Matches(this.content)) {
                string rurl = Fix(m.Groups["url"].Value);
                string rtitle = Fix(m.Groups["title"].Value);
                string rcontent = Fix(m.Groups["content"].Value);
                results.Add(new YahooResult(rurl, rtitle, rcontent));
            }
            return results.ToArray();
        }
    }

    /// <summary>Fetches the page after this one.</summary>
    public YahooSearch NextPage() { return new YahooSearch(this.query, this.page + 1); }

    /// <summary>Fetches an arbitrary 1-based page for the same query.</summary>
    public YahooSearch GetPage(int page) { return new YahooSearch(this.query, page); }
}
/// <summary>A single search hit: its URL, its title and its abstract text.</summary>
class YahooResult {
    private string url;
    private string title;
    private string content;

    /// <summary>Creates a result from its three scraped fields.</summary>
    public YahooResult(string url, string title, string content) {
        this.url = url;
        this.title = title;
        this.content = content;
    }

    /// <summary>Address of the hit.</summary>
    public string URL {
        get { return url; }
        set { url = value; }
    }

    /// <summary>Title of the hit.</summary>
    public string Title {
        get { return title; }
        set { title = value; }
    }

    /// <summary>Abstract text of the hit.</summary>
    public string Content {
        get { return content; }
        set { content = value; }
    }
}
// Usage: print the title of every result on the first page for "test".
class Prog {
    static void Main() {
        YahooSearch search = new YahooSearch("test");
        YahooResult[] hits = search.Results;
        for (int i = 0; i < hits.Length; i++) {
            Console.WriteLine(hits[i].Title);
        }
    }
}</lang>
Perl
<lang perl>package YahooSearch;
use Encode; use HTTP::Cookies; use WWW::Mechanize;
# --- Internals -------------------------------------------------
# apply BLOCK, VALUE: run BLOCK with $_ set to a copy of VALUE and
# return whatever $_ holds afterwards. The caller's value is not modified.
sub apply (&$)
 {my ($code, $value) = @_;
  local $_ = $value;
  $code->();
  return $_;}
# We construct a cookie to get 100 results per page and prevent
# "enhanced results".
# The 'sm' value is a '|'-joined list of '!'-prefixed codes, with every
# non-alphanumeric character percent-encoded.
my $search_prefs = 'v=1&n=100&sm=' .
    apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge}
    join '|', map {'!' . $_}
    qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);

my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

# One shared user agent for the whole package; stack_depth => 0 keeps it
# from remembering previously visited pages.
my $mech = WWW::Mechanize->new
   (cookie_jar => $cookies, stack_depth => 0);
# read_page: scrape the page currently loaded in $mech.
# Returns (URL of the "Next >" link or undef, reference to an array of
# {url, title, content} hashes). Title and content have their tags removed
# and are encoded as UTF-8.
sub read_page
 {# find_link returns undef when there is no "Next >" link (last page);
  # calling ->url on that would die, so test the link first.
  my $next_link = $mech->find_link(text => 'Next >');
  my $next = $next_link ? $next_link->url : undef;
  my $page = decode 'iso-8859-1', $mech->content;
  my @results;
  # NOTE(review): the tail of this pattern lost its HTML tags in the
  # published source; the abstract <div> below is a reconstruction --
  # confirm it against a real response.
  while ($page =~ m
         {<a \s class="yschttl \s spt" \s
          href=" ([^"]+) " \s* >                #"
          (.+?) </a>
          .+?
          <div \s class="abstr">
          (.+?) </div>}xg)
     {push @results, {url => $1, title => $2, content => $3};
      foreach ( @{$results[-1]}{qw(title content)} )
         {s/<.+?>//g;
          $_ = encode 'utf8', $_;}}
  return $next, \@results;}
# --- Methods ---------------------------------------------------
# new(QUERY): run the search and return an object holding the first page
# of results together with the link to the following page.
sub new
 {my ($invocant, $query) = @_;
  my $class = ref($invocant) || $invocant;
  # Percent-encode everything except alphanumerics and spaces, then turn
  # the spaces into '+'.
  (my $escaped = $query) =~ s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
  $escaped =~ s/ /+/g;
  $mech->get('http://search.yahoo.com/search?p=' . $escaped);
  my ($next, $results) = read_page();
  my $self = {link_to_next => $next, results => $results};
  return bless $self, $class;}
# results: the result hashes of the page currently held by the object.
sub results
 {my $self = shift;
  return @{ $self->{results} };}
# next_page: load the following page into the object.
# Returns 1 on success; when there is no further page the results are
# emptied and undef is returned.
sub next_page
 {my $self = shift;
  my $url = $self->{link_to_next};
  if (not $url)
     {$self->{results} = [];
      return undef;}
  $mech->get($url);
  my ($following, $found) = read_page();
  @{$self}{qw(link_to_next results)} = ($following, $found);
  return 1;}</lang>
Python
<lang python>import urllib
import re
def fix(x):
    """Clean one captured fragment of a result.

    Removes Yahoo!'s highlight and word-break markup and cuts the text at the
    end-of-title marker ``</a></h3></div>`` when that marker is present.

    NOTE(review): the tag literals were stripped from the published source;
    they are reconstructed here -- confirm against a real response.
    """
    # The bold ellipsis has to be removed before the bare <b> tags, otherwise
    # it can never match.
    for markup in ("<b>...</b>", "<b>", "</b>", "<wbr />", "<wbr>"):
        x = x.replace(markup, "")
    # partition() leaves the text intact when the marker is missing, whereas
    # x[:x.find(marker)] would silently drop the last character.
    return x.partition("</a></h3></div>")[0]
class YahooSearch(object):
    """One page of Yahoo! web-search results for a query.

    The page is downloaded when the object is created.

    Attributes:
        query: the search terms as given by the caller.
        page: 1-based page number.
        url: the URL that was requested.
        content: the downloaded HTML as text.

    Properties:
        results: list of YahooResult objects scraped from ``content``.
        nextpage: a new YahooSearch for the following page.
    """

    # Number of results per page; "b" in the URL is the 1-based index of the
    # first result on the requested page.
    _PAGE_SIZE = 10

    # NOTE(review): the tail of this pattern lost its HTML tags in the
    # published source; the abstract/url part is a reconstruction -- confirm
    # it against a real response.
    _RESULT_RE = re.compile(
        '<a class="yschttl spt" href=".+?" >(.+?)</a></h3></div>'
        '<div class="abstr">(.+?)</div><span class=url>(.+?)</span>')

    def __init__(self, query, page=1):
        # Local imports so the block works on both Python 2 and Python 3
        # without touching the file-level imports.
        try:
            from urllib import quote_plus, urlopen  # Python 2
        except ImportError:
            from urllib.parse import quote_plus  # Python 3
            from urllib.request import urlopen

        self.query = query
        self.page = page
        # Escape the query so spaces, '&', '#', ... cannot corrupt the URL.
        self.url = "http://search.yahoo.com/search?p=%s&b=%s" % (
            quote_plus(query), (self.page - 1) * self._PAGE_SIZE + 1)

        response = urlopen(self.url)
        try:
            data = response.read()
        finally:
            response.close()
        # Python 3 returns bytes; the pattern above works on text.
        if not isinstance(data, str):
            data = data.decode("utf-8", "replace")
        self.content = data

    def getresults(self):
        """Return the results found on this page as a list of YahooResult."""
        # Build into a local list: 'results' is a read-only property, so
        # assigning self.results would raise AttributeError.
        found = []
        for title, content, url in self._RESULT_RE.findall(self.content):
            found.append(YahooResult(fix(title), fix(content), fix(url)))
        return found

    def getnextpage(self):
        """Download and return the page after this one."""
        return YahooSearch(self.query, self.page + 1)

    results = property(fget=getresults)
    nextpage = property(fget=getnextpage)
class YahooResult:
    """A single search hit: its title, its abstract text and its URL."""

    def __init__(self, title, content, url):
        self.title = title
        self.content = content
        self.url = url
- Usage:
x = YahooSearch("test")
for result in x.results:
print result.title</lang>
Tcl
<lang tcl>package require http
# fix --
#   Clean one captured fragment: cut it at the end-of-title marker and
#   remove highlight / word-break markup.
#   NOTE(review): the tag literals were stripped from the published source;
#   they are reconstructed here -- confirm against a real response.
proc fix s {
    string map {<b>...</b> "" <b> "" </b> "" <wbr> "" "<wbr />" ""} \
            [regsub "</a></h3></div>.*" $s ""]
}

# YahooSearch --
#   Fetch one page of results for $term (pages are numbered from 1).
#   Returns a flat list: title content url title content url ...
#   Side effect: (re)defines the command Nextpage so that calling it
#   fetches the following page.
proc YahooSearch {term {page 1}} {
    # Build the (ugly) scraper regular expression
    # NOTE(review): reconstructed, see the note on fix above.
    append re {<a class="yschttl spt" href=".+?" >(.+?)</a></h3></div>}
    append re {<div class="abstr">(.+?)</div><span class=url>(.+?)</span>}

    # Perform the query; note that this handles special characters
    # in the query term correctly
    set q [http::formatQuery p $term b [expr {$page*10-9}]]
    set token [http::geturl http://search.yahoo.com/search?$q]
    set data [http::data $token]
    http::cleanup $token

    # Assemble the results into a nice list
    set results {}
    foreach {- title content url} [regexp -all -inline $re $data] {
        lappend results [fix $title] [fix $content] [fix $url]
    }

    # set up the call for the next page
    interp alias {} Nextpage {} YahooSearch $term [incr page]

    return $results
}
# Usage: get the first two pages of results
foreach {title content url} [YahooSearch "test"] {
    puts $title
}
foreach {title content url} [Nextpage] {
    puts $title
}</lang>
With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like: <lang tcl>package require Tcl 8.6
# WebSearcher --
#   Object wrapper around the YahooSearch procedure. An instance holds the
#   search term, the current page number and that page's results.
oo::class create WebSearcher {
    variable page term results

    # Remember the term and load the first page.
    constructor searchTerm {
        set page 0
        set term $searchTerm
        my nextPage
    }

    # This next method *is* a very Tcl-ish way of doing iteration.
    # It runs body in the caller's scope once per result of the current
    # page, with the three named caller variables set to the result's
    # title, contents and url.
    method for {titleVar contentsVar urlVar body} {
        upvar 1 $titleVar t $contentsVar c $urlVar v
        foreach {t c v} $results {
            uplevel 1 $body
        }
    }

    # Reuse the previous code for simplicity rather than writing it anew
    # Of course, if we were serious about this, we'd put the code here properly
    method nextPage {} {
        set results [YahooSearch $term [incr page]]
        return
    }
}
# How to use. Note the 'for' method use below; new "keywords" as methods!
set ytest [WebSearcher new "test"]
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest nextPage
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest destroy ;# standard method that deletes the object</lang>
However, the paradigm of an iterator is also interesting and is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic...
<lang tcl>package require Tcl 8.6
# yahoo! --
#   Create a coroutine that yields the results for $term one at a time, as
#   dictionaries with the keys title, content and url, fetching further
#   pages as needed. Returns the name of the coroutine command.
proc yahoo! term {
    coroutine yahoo![incr ::yahoo] apply {term {
        # The first resumption only hands the coroutine's name back.
        yield [info coroutine]
        while 1 {
            set results [YahooSearch $term [incr step]]
            # An empty page means there is nothing more to deliver.
            if {[llength $results] == 0} {
                return -code break
            }
            foreach {t c u} $results {
                yield [dict create title $t content $c url $u]
            }
        }
    }} $term
}
# test by getting first fifty titles...
set it [yahoo! "test"]
for {set i 50} {$i>0} {incr i -1} {
    puts [dict get [$it] title]
    after 300  ;# Slow the code down... :-)
}</lang>
Another approach: uses a class as specified in the task. It also uses an HTML parser from tcllib
(parsing HTML with regular expressions is a particular annoyance of mine).
<lang tcl>package require Tcl 8.6
package require http
package require htmlparse
package require textutil::adjust
# yahoosearch --
#   Search class that scrapes the result page with htmlparse instead of a
#   regular expression. Call 'search' once, then 'nextresult' repeatedly;
#   each result is a dictionary with the keys url and title, plus abstract
#   when an abstract block was found.
oo::class create yahoosearch {

    # Store the search term and start at the first result.
    method search {s} {
        my variable searchterm page baseurl
        set searchterm $s
        set page 1
        set baseurl {http://ca.search.yahoo.com/search}
    }

    # Download the current page and run it through the parser; the callback
    # below fills the results list.
    method getresults {} {
        my variable state results current_data
        set results [list]
        set current_data [dict create]
        set state looking_for_results
        htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
    }

    # 'page' is sent as the b query parameter and is advanced by 10 per page.
    method nextpage {} {
        my variable page
        incr page 10
        my getresults
    }

    # Return the next result, loading the first or the following page when
    # the current list is missing or used up.
    method nextresult {} {
        my variable results page
        if { ! [info exists results]} {
            my getresults
        } elseif {[llength $results] == 0} {
            my nextpage
        }
        set results [lassign $results result]
        return $result
    }

    # Fetch the HTML of the current page.
    method gethtml {} {
        my variable searchterm page baseurl
        set url [format {%s?%s} $baseurl [::http::formatQuery p $searchterm b $page]]
        set response [http::geturl $url]
        set html [http::data $response]
        http::cleanup $response
        return $html
    }

    # Parser callback, invoked once per tag; a small state machine that
    # collects url, title and abstract of each result into current_data and
    # appends finished results to the results list.
    method html_parser_callback {tag slash param textBehindTheTag} {
        my variable state results current_data
        switch -exact -- $state {
            looking_for_results {
                if {$tag eq "div" && [string first {id="main"} $param] != -1} {
                    set state ready
                }
            }
            ready {
                # A new result block, or the end of the document, completes
                # whatever was collected so far.
                if {($tag eq "div" && [string first {class="res} $param] != -1)
                    || ($tag eq "html" && $slash eq "/")
                } {
                    if {[dict size $current_data] > 0} {lappend results $current_data}
                    set current_data [dict create]
                    set state getting_url
                }
            }
            getting_url {
                if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                    if {[regexp {href="(.+?)"} $param - url]} {
                        dict set current_data url $url
                    } else {
                        dict set current_data url "no href in tag params: '$param'"
                    }
                    dict set current_data title $textBehindTheTag
                    set state getting_title
                }
            }
            getting_title {
                if {$tag eq "a" && $slash eq "/"} {
                    set state looking_for_abstract
                } else {
                    dict append current_data title $textBehindTheTag
                }
            }
            looking_for_abstract {
                if {$tag eq "span" && [string first {class="url} $param] != -1} {
                    set state ready
                } elseif {$tag eq "div" && [string first {class="abstr} $param] != -1} {
                    dict set current_data abstract $textBehindTheTag
                    set state getting_abstract
                }
            }
            getting_abstract {
                if {$tag eq "div" && $slash eq "/"} {
                    set state ready
                } else {
                    dict append current_data abstract $textBehindTheTag
                }
            }
        }
    }
}
# Usage: print title, url and the indented abstract of the first 15 results.
yahoosearch create searcher
searcher search "search text here"

for {set x 1} {$x <= 15} {incr x} {
    set result [searcher nextresult]
    dict with result {
        puts $title
        puts $url
        puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] " "]
        puts ""
    }
}</lang>