Yahoo! search interface: Difference between revisions

Content added Content deleted
(→‎{{header|C sharp}}: fixed code lines length and adjusted extraction.)
Line 38: Line 38:


=={{header|C sharp}}==
=={{header|C sharp}}==
Generally it is not a good idea to scrape web pages.
{{lines too long|C sharp}}
E.g. all implementations for this task which use a regex matching
"<a class=" fail by now, after Yahoo has changed its output format.
<lang csharp>using System;
<lang csharp>using System;
using System.Net;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Text.RegularExpressions;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Generic;
using System.Linq;


class YahooSearch {
class YahooSearch {
private string query;
private string query;
private string content;
private string content;
private int page = 1;
private int page;


const string yahoo = "http://search.yahoo.com/search?";
public YahooSearch(string query) {

this.query = query;
public YahooSearch(string query) : this(query, 0) { }
this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query);
}


public YahooSearch(string query, int page) {
public YahooSearch(string query, int page) {
this.query = query;
this.query = query;
this.page = page;
this.page = page;
this.content = new WebClient().DownloadString(String.Format("http://search.yahoo.com/search?p={0}&b={1}", query,
this.content = new WebClient()
((this.page - 1) * 10) + 1));
.DownloadString(
string.Format(yahoo + "p={0}&b={1}", query, this.page * 10 + 1)
);
}
}

// Removes the inline markup listed below from a captured result fragment and
// cuts the fragment off at the first "</a></h3>", if that marker is present.
//   x       - raw HTML fragment captured from the downloaded page.
//   returns - the fragment without the listed tags, truncated before the marker.
string Fix(string x) {
    // The bold ellipsis must be removed first: once "<b>" and "</b>" have been
    // stripped individually, the pattern "<b>...</b>" can never match again.
    x = x.Replace("<b>...</b>", "")
         .Replace("<b>", "")
         .Replace("</b>", "")
         .Replace("<wbr />", "")
         .Replace("<wbr>", "");

    // Ordinal search: this is markup, not linguistic text, so the result must
    // not depend on the current culture.
    int i = x.IndexOf("</a></h3>", StringComparison.Ordinal);

    // i == 0 is a valid hit (everything is trailing markup), so test ">= 0".
    return i >= 0 ? x.Substring(0, i) : x;
}


public YahooResult[] Results {
public YahooResult[] Results {
get {
get {
ArrayList results = new ArrayList();
List<YahooResult> results = new List<YahooResult>();


Func<string, string, string> substringBefore = (str, before) =>
foreach (Match e in new Regex("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>" +
{
"</h3></div>(?(<div class=\"sm-bd sm-nophoto\" id=\"sm-bd-4-1\">.+?</div>))" +
"<div class=\"abstr\">(.+?)</div><span class=url>(.+?)</span>").Matches(this.content)) {
int iHref = str.IndexOf(before);
string rurl = Fix(e.Groups[3].Value);
return iHref < 0 ? "" : str.Substring(0, iHref);
string rtitle = Fix(e.Groups[1].Value);
};
string rcontent = Fix(e.Groups[2].Value);
Func<string, string, string> substringAfter = (str, after) =>
{
int iHref = str.IndexOf(after);
return iHref < 0 ? "" : str.Substring(iHref + after.Length);
};
Converter<string, string> getText = p =>
Regex.Replace(p, "<[^>]*>", x => "");

Regex rx = new Regex(@"
<li>
<div \s class=""res"">
<div>
<h3>
<a \s (?'LinkAttributes'[^>]+)>
(?'LinkText' .*?)
(?></a>)
</h3>
</div>
<div \s class=""abstr"">
(?'Abstract' .*?)
(?></div>)
.*?
(></div>)
</li>",
RegexOptions.IgnorePatternWhitespace
| RegexOptions.ExplicitCapture
);
foreach (Match e in rx.Matches(this.content)) {
string rurl = getText(substringBefore(substringAfter(
e.Groups["LinkAttributes"].Value, @"href="""), @""""));
string rtitle = getText(e.Groups["LinkText"].Value);
string rcontent = getText(e.Groups["Abstract"].Value);
results.Add(new YahooResult(rurl, rtitle, rcontent));
results.Add(new YahooResult(rurl, rtitle, rcontent));
}
}
return (YahooResult[])results.ToArray(typeof(YahooResult));
return results.ToArray();
}
}
}
}
Line 107: Line 130:
this.Title = title;
this.Title = title;
this.Content = content;
this.Content = content;
}

// Renders this result as three labelled lines (title, link, text), each
// preceded by a newline.
public override string ToString()
{
    // Same format string as before; the values are passed as one argument array.
    object[] fields = { Title, URL, Content };
    return string.Format("\nTitle: {0}\nLink: {1}\nText: {2}", fields);
}
}
}
}
Line 114: Line 143:
class Prog {
class Prog {
static void Main() {
static void Main() {
YahooSearch x = new YahooSearch("test");
foreach (int page in new[] { 0, 1 })
{
YahooSearch x = new YahooSearch("test", page);


foreach (YahooResult result in x.Results) {
foreach (YahooResult result in x.Results)
Console.WriteLine(result.Title);
{
Console.WriteLine(result);
}
}
}
}
}
}
}</lang>
</lang>


=={{header|GUISS}}==
=={{header|GUISS}}==