Yahoo! search interface: Difference between revisions
(→Tcl: Added implementation) |
m (→{{header|Tcl}}) |
||
Line 240: | Line 240: | ||
Tcl 8.6 can use its coroutine support to produce an iterator more like that Python code: |
Tcl 8.6 can use its coroutine support to produce an iterator more like that Python code: |
||
<lang tcl>proc yahoo! term { |
<lang tcl>proc yahoo! term { |
||
coroutine yahoo![incr ::yahoo] |
coroutine yahoo![incr ::yahoo] apply {term { |
||
yield [info coroutine] |
yield [info coroutine] |
||
while 1 { |
while 1 { |
||
Line 247: | Line 247: | ||
return -code break |
return -code break |
||
} |
} |
||
foreach {t c u} $ |
foreach {t c u} $results { |
||
yield [dict create title $t content $c url $u] |
yield [dict create title $t content $c url $u] |
||
} |
} |
||
Line 258: | Line 258: | ||
while 1 { |
while 1 { |
||
puts [dict get [$it] title] |
puts [dict get [$it] title] |
||
after 300 ;# Slow the code down... :-) |
|||
}</lang> |
}</lang> |
Revision as of 10:15, 14 May 2009
You are encouraged to solve this task according to the task description, using any language you may know.
Create a class for searching Yahoo results. It must implement a Next Page method, and read URL, Title and Content from results.
C#
<lang csharp>using System; using System.Net; using System.Text; using System.Text.RegularExpressions; using System.Collections; using System.Collections.Generic; using System.Linq;
// Fetches one page of Yahoo! web-search results and exposes the parsed hits.
// The page HTML is downloaded synchronously inside the constructor and kept in `content`.
class YahooSearch {
// query: the search terms as passed in; content: HTML text of the downloaded page; page: page number (defaults to 1).
private string query; private string content; private int page = 1;
// Downloads the results page for `query`. The query is concatenated into the URL as-is; no escaping is applied here.
public YahooSearch(string query) { this.query = query; this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query); }
// Downloads a specific page: the `b` URL parameter is computed as (page - 1) * 10 + 1.
// NOTE(review): presumably `b` is the index of the first result with 10 results per page -- confirm.
public YahooSearch(string query, int page) { this.query = query; this.page = page; this.content = new WebClient().DownloadString(String.Format("http://search.yahoo.com/search?p={0}&b={1}", query, ((this.page - 1) * 10) + 1)); }
// Removes a fixed list of substrings from x, then truncates x at the first "</a>" when it is found past index 0.
// NOTE(review): several Replace() search strings below are empty or a single space as shown; they look like
// HTML tag literals that were lost when this page was extracted. Restore them from the upstream revision.
string Fix(string x) { x = x.Replace("", "").Replace("", "").Replace("", "").Replace(" ", "").Replace("...", "");
int i = x.IndexOf("</a>");
if (i > 0) return x.Substring(0, i); else return x; }
// Parses `content` on every access and returns one YahooResult per regex match:
// title from group 1, content from group 2, URL from group 3, each passed through Fix().
// NOTE(review): the pattern literal below is split across lines and does not show three plain capture groups;
// it appears damaged by extraction -- confirm against the upstream revision before relying on it.
public YahooResult[] Results { get { ArrayList results = new ArrayList();
foreach (Match e in new Regex("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>(?(
))
(.+?)").Matches(this.content)) {
string rurl = Fix(e.Groups[3].Value); string rtitle = Fix(e.Groups[1].Value); string rcontent = Fix(e.Groups[2].Value); results.Add(new YahooResult(rurl, rtitle, rcontent)); } return (YahooResult[])results.ToArray(typeof(YahooResult)); } }
// Returns a new YahooSearch for the page after this one (triggers a new download).
public YahooSearch NextPage() { return new YahooSearch(this.query, this.page + 1); }
// Returns a new YahooSearch for the given page number of the same query (triggers a new download).
public YahooSearch GetPage(int page) { return new YahooSearch(this.query, page); }
}
// Plain data holder for one search hit: its URL, its title and its
// descriptive content text. All three properties are freely readable and writable.
class YahooResult {
    public string URL { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }

    // Stores the three values as given; no validation or normalisation is done.
    public YahooResult(string url, string title, string content) {
        URL = url;
        Title = title;
        Content = content;
    }
}
// Usage:
class Prog {
    // Demo entry point: download the results page for "test" and print
    // the title of every hit, one per line.
    static void Main() {
        var search = new YahooSearch("test");
        YahooResult[] hits = search.Results;
        foreach (var hit in hits) {
            Console.WriteLine(hit.Title);
        }
    }
}</lang>
Perl
<lang perl>package YahooSearch;
use Encode; use HTTP::Cookies; use WWW::Mechanize;
# --- Internals -------------------------------------------------
# apply BLOCK SCALAR
# Runs BLOCK with $_ temporarily set to a copy of SCALAR and returns
# whatever $_ holds afterwards. The block's own return value is ignored,
# and the caller's $_ is restored on exit.
sub apply (&$) {
    my ($code, $value) = @_;
    local $_ = $value;
    $code->();
    return $_;
}
# We construct a cookie to get 100 results per page and prevent
# "enhanced results".
# Value of the 'sB' preference cookie: fixed "v" and "n" settings followed by
# an "sm" field made of '!'-prefixed codes joined with '|' and percent-encoded.
my $search_prefs = do {
    my $sm = join '|', map {'!' . $_}
        qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);
    'v=1&n=100&sm=' . apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge} $sm;
};

# Cookie jar holding just that preference cookie for search.yahoo.com.
my $cookies = HTTP::Cookies->new;
$cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');

# Single user agent shared by every sub in this package.
my $mech = WWW::Mechanize->new(cookie_jar => $cookies, stack_depth => 0);
# read_page()
# Parses the page currently loaded in the shared $mech object.
# Returns a two-element list:
#   1. the URL of the link whose text is 'Next >';
#   2. an array ref of hashes with keys url, title and content, one per regex match.
# The page body is decoded as ISO-8859-1 before matching. For each hit, anything
# matching <.+?> is removed from title and content and both are re-encoded as UTF-8.
# NOTE(review): ->url is called directly on the result of find_link; if find_link
# returns undef when no 'Next >' link exists this will die rather than return an
# empty link -- confirm against the WWW::Mechanize documentation.
# NOTE(review): the tail of the pattern (between "</a>" and the last group) looks
# like it lost literal markup when this page was extracted; verify against upstream.
sub read_page
{my ($next, $page, @results) = ($mech->find_link(text => 'Next >')->url, decode 'iso-8859-1', $mech->content); while ($page =~ m
{
<a \s class="yschttl \s spt" \s
href=" ([^"]+) " \s* > #"
(.+?) </a>
.+?
(.+?) }xg)
{push @results, {url => $1, title => $2, content => $3};
foreach ( @{$results[-1]}{qw(title content)} )
{s/<.+?>//g;
$_ = encode 'utf8', $_;}}
return $next, \@results;}
# --- Methods ---------------------------------------------------
# Constructor: YahooSearch->new($query)
# Percent-encodes the query (spaces become '+'), fetches the first results
# page through the shared $mech, and returns an object holding that page's
# results together with the link to the following page.
sub new {
    my ($invocant, $query) = @_;
    my $class = ref($invocant) || $invocant;

    my $encoded = apply {
        s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
        s/ /+/g;
    } $query;
    $mech->get('http://search.yahoo.com/search?p=' . $encoded);

    my ($next_link, $hits) = read_page();
    my $self = {link_to_next => $next_link, results => $hits};
    return bless $self, $class;
}
# Accessor: returns the result hashes of the current page as a list
# (or their count when called in scalar context).
sub results {
    my ($self) = @_;
    return @{ $self->{results} };
}
# Advances the object to the following results page.
# Returns 1 after loading it. When there is no stored link to a next page,
# the results are emptied and undef is returned instead.
sub next_page {
    my ($self) = @_;
    my $link = $self->{link_to_next};

    if (!$link) {
        $self->{results} = [];
        return undef;
    }

    $mech->get($link);
    my ($following, $hits) = read_page();
    @{$self}{qw(link_to_next results)} = ($following, $hits);
    return 1;
}</lang>
Python
<lang python>import urllib
import re
# fix(x): clean one captured fragment of result HTML.
# Applies a chain of str.replace() removals to x, then returns x cut off just
# before the position reported by x.find() for the closing-anchor marker.
# NOTE(review): the replace() search strings and the find() argument are empty,
# blank or split across lines as shown; they look like HTML tag literals lost
# when this page was extracted. The indentation of the body is also missing.
# Restore both from the upstream revision before running this.
# NOTE(review): when find() returns -1 the slice x[:-1] drops the last
# character -- confirm whether that is intended.
def fix(x):
x = x.replace("","").replace("","").replace(" ","").replace(" ","").replace("...","")
return x[:x.find("</a>
")]
# YahooSearch(query, page=1): downloads one page of Yahoo! search results.
#   __init__     stores query and page, builds the URL with b = (page - 1) * 10 + 1,
#                and reads the page body into self.content via urllib.urlopen.
#   getresults   runs re.findall over self.content and builds a YahooResult from
#                fix(i[0]) as title, fix(i[1]) as content and fix(i[2]) as url.
#   getnextpage  returns a new YahooSearch for the same query at page + 1.
#   results / nextpage are read-only properties wrapping the two getters.
# NOTE(review): the statements of each method have been joined onto single lines
# and lost their indentation, so this is not valid Python as shown.
# NOTE(review): the query is interpolated into the URL without escaping.
class YahooSearch:
def __init__(self, query, page=1): self.query = query self.page = page self.url = "http://search.yahoo.com/search?p=%s&b=%s" %(self.query, ((self.page - 1) * 10 + 1)) self.content = urllib.urlopen(self.url).read() def getresults(self): self.results = []
# NOTE(review): the pattern below shows only two capture groups while i[2] is
# read afterwards, and its second literal is split across lines; part of the
# pattern appears to have been lost in extraction -- verify against upstream.
for i in re.findall("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>"+\ "
(.+?)",self.content):
title = fix(i[0]) content = fix(i[1]) url = fix(i[2]) self.results.append(YahooResult(title, content, url)) return self.results def getnextpage(self): return YahooSearch(self.query, self.page+1) results = property(fget=getresults) nextpage = property(fget=getnextpage)
class YahooResult:
    """One search hit: its title, its descriptive content text and its URL.

    The three constructor arguments are stored unchanged as the attributes
    ``title``, ``content`` and ``url``.
    """

    def __init__(self, title, content, url):
        self.title = title
        self.content = content
        self.url = url
# Usage:
x = YahooSearch("test")
for result in x.results:
print result.title</lang>
Tcl
<lang tcl>package require http
# fix s
#   Cleans one captured fragment: regsub first removes the text matched by
#   "</a>.*" from $s, then [string map] removes a fixed set of substrings.
#
# YahooSearch term ?page?
#   Queries search.yahoo.com for $term (page defaults to 1) using the http
#   package, with the "b" query parameter set to page*10-9, and returns a
#   flat list of title/content/url values, each passed through [fix].
#
# NOTE(review): the [string map] table and the second half of the regular
# expression look like they lost HTML tag text when this page was extracted.
# NOTE(review): several commands have been joined onto the same line as the
# preceding command or comment (including the close of [fix] and the start of
# [YahooSearch]); as shown, Tcl would treat the joined text as part of that
# comment or as extra arguments. Restore the line breaks from the upstream
# revision before running this.
proc fix s {
string map {... "" "" """" " " ""} \
[regsub "</a>.*" $s ""]
} proc YahooSearch {term {page 1}} {
# Build the (ugly) scraper URL
append re {<a class="yschttl spt" href=".+?" >(.+?)</a>} append re {
(.+?)}
# Perform the query; note that this handles special characters # in the query term correctly set q [http::formatQuery p $term b [expr {$page*10-9}]] set token [http::geturl http://search.yahoo.com/search?$q] set data [http::data $token] http::cleanup $token
# Assemble the results into a nice list set results {} foreach {- title content url} [regexp -all -inline $re $data] { lappend results [fix $title] [fix $content] [fix $url] } return $results
}
# Usage:
foreach {title content url} [YahooSearch "test"] {
    puts $title
}</lang>
Tcl 8.6 can use its coroutine support to produce an iterator more like that Python code:
<lang tcl>proc yahoo! term {
    # Creates a uniquely named coroutine (yahoo!1, yahoo!2, ...) that iterates
    # over the search results for $term, and returns that coroutine's command
    # name. Each later call of the returned command yields one result as a
    # dict with the keys title, content and url.
    coroutine yahoo![incr ::yahoo] apply {term {
        # The first yield hands the coroutine's own name back to the creator.
        yield [info coroutine]
        while 1 {
            # step is created by the first incr, so pages start at 1.
            set results [YahooSearch $term [incr step]]
            if {[llength $results] == 0} {
                # An empty page ends the iteration with a "break" return code.
                return -code break
            }
            foreach {t c u} $results {
                yield [dict create title $t content $c url $u]
            }
        }
    }} $term
}
# test...
# Create an iterator for "test" and print the title of each result it yields.
set it [yahoo! "test"]
while 1 {
    puts [dict get [$it] title]
    after 300 ;# Slow the code down... :-)
}</lang>