Yahoo! search interface: Difference between revisions

→‎{{header|C sharp}}: fixed code lines length and adjusted extraction.
(→‎{{header|C sharp}}: fixed code lines length and adjusted extraction.)
Line 38:
 
=={{header|C sharp}}==
Generally it is not a good idea to scrape web pages.
{{lines too long|C sharp}}
E.g., all implementations for this task that use a regex matching
"<a class=" now fail, because Yahoo has changed its output format.
<lang csharp>using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
 
class YahooSearch {
private string query;
private string content;
private int page = 1;
 
const string yahoo = "http://search.yahoo.com/search?";
public YahooSearch(string query) {
 
this.query = query;
public YahooSearch(string query) : this(query, 0) { }
this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query);
}
 
public YahooSearch(string query, int page) {
this.query = query;
this.page = page;
this.content = new WebClient().DownloadString(String.Format("http://search.yahoo.com/search?p={0}&b={1}", query,
.DownloadString((this.page - 1) * 10) + 1));
string.Format(yahoo + "p={0}&b={1}", query, this.page * 10 + 1)
this.query = query );
}
 
/// <summary>
/// Strips the inline markup (<c>&lt;b&gt;</c>, <c>&lt;/b&gt;</c>, <c>&lt;wbr&gt;</c>,
/// <c>&lt;wbr /&gt;</c> and the bold ellipsis <c>&lt;b&gt;...&lt;/b&gt;</c>) from a captured
/// fragment and cuts the fragment off at the first <c>&lt;/a&gt;&lt;/h3&gt;</c>, if present.
/// </summary>
/// <param name="x">The raw text captured from the downloaded page.</param>
/// <returns>
/// The cleaned text up to (not including) the first <c>&lt;/a&gt;&lt;/h3&gt;</c>,
/// or the whole cleaned text when that terminator does not occur.
/// </returns>
string Fix(string x) {
    // The bold ellipsis must be removed first: once "<b>" and "</b>" have been
    // stripped individually, the pattern "<b>...</b>" can never match again and
    // a stray "..." would be left in the output.
    x = x.Replace("<b>...</b>", "")
         .Replace("<b>", "")
         .Replace("</b>", "")
         .Replace("<wbr />", "")
         .Replace("<wbr>", "");

    // Ordinal comparison: this is markup, not linguistic text.
    int i = x.IndexOf("</a></h3>", StringComparison.Ordinal);

    // IndexOf returns -1 when the terminator is absent; index 0 is a valid hit
    // and must truncate to the empty string rather than return the input whole.
    return i >= 0 ? x.Substring(0, i) : x;
}
 
public YahooResult[] Results {
get {
ArrayListList<YahooResult> results = new ArrayListList<YahooResult>();
 
Func<string, string, string> substringBefore = (str, before) =>
foreach (Match e in new Regex("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>" +
{
"</h3></div>(?(<div class=\"sm-bd sm-nophoto\" id=\"sm-bd-4-1\">.+?</div>))" +
"<divint iHref class=\"abstr\">(.+?)</div><span class=url>(str.+?)</span>").MatchesIndexOf(this.contentbefore)) {;
stringreturn rurliHref =< Fix(e0 ? "" : str.Groups[3].ValueSubstring(0, iHref);
string rtitle = Fix(e.Groups[1].Value)};
Func<string, string, string> rcontentsubstringAfter = Fix(e.Groups[2].Valuestr, after); =>
{
int iiHref = xstr.IndexOf("</a></h3>"after);
if (i > 0) return xiHref < 0 ? "" : str.Substring(0,iHref i+ after.Length);
};
Converter<string, string> getText = p =>
Regex.Replace(p, "<[^>]*>", x => "");
 
Regex rx = new Regex(@"
else return x; <li>
<div \s class=""res"">
<div>
<h3>
<a \s (?'LinkAttributes'[^>]+)>
(?'LinkText' .*?)
(?></a>)
</h3>
</div>
<div \s class=""abstr"">
(?'Abstract' .*?)
(?></div>)
.*?
(></div>)
</li>",
RegexOptions.IgnorePatternWhitespace
| RegexOptions.ExplicitCapture
);
foreach (Match e in rx.Matches(this.content)) {
string rurl = getText(substringBefore(substringAfter(
e.Groups["LinkAttributes"].Value, @"href="""), @""""));
string rtitle = getText(e.Groups["LinkText"].Value);
string rcontent = getText(e.Groups["Abstract"].Value);
results.Add(new YahooResult(rurl, rtitle, rcontent));
}
return (YahooResult[])results.ToArray(typeof(YahooResult));
}
}
Line 107 ⟶ 130:
this.Title = title;
this.Content = content;
}
 
public override string ToString()
} {
return string.Format("\nTitle: {0}\nLink: {1}\nText: {2}",
Title, URL, Content);
}
}
Line 114 ⟶ 143:
class Prog {
static void Main() {
YahooSearchforeach x(int =page in new[] YahooSearch("test"{ 0, 1 });
{
YahooSearch x = new YahooSearch("test", page);
 
foreach (YahooResult result in x.Results) {
Console.WriteLine(result.Title);{
Console.WriteLine(result);
}
}
}
}
}</lang>
 
=={{header|GUISS}}==
Anonymous user