Anonymous user
Yahoo! search interface: Difference between revisions
→{{header|C sharp}}: fixed code lines length and adjusted extraction.
(→{{header|C sharp}}: fixed code lines length and adjusted extraction.) |
|||
Line 38:
=={{header|C sharp}}==
Generally it is not a good idea to scrape web pages.
E.g. all implementations for this task that search with a regex for
"<a class=" fail by now, after Yahoo has changed its output format.
<lang csharp>using System;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections.Generic;
class YahooSearch {
// Search terms as passed to the constructor.
private string query;
// Raw response body downloaded for the requested result page.
private string content;
// Result page index as passed to the constructor.
private int page;
// Base URL of the Yahoo search endpoint; query parameters are appended to it.
const string yahoo = "http://search.yahoo.com/search?";
public YahooSearch(string query) {▼
this.query = query;▼
▲ public YahooSearch(string query) : this(query, 0) { }
}▼
/// <summary>
/// Stores the search parameters and downloads the matching Yahoo result page
/// into <c>content</c>.
/// </summary>
/// <param name="query">Search terms; inserted into the request URL as given.</param>
/// <param name="page">Result page index; page 0 requests results starting at offset 1.</param>
public YahooSearch(string query, int page) {
    this.query = query;
    this.page = page;

    // "b" is the 1-based offset of the first result; the factor 10 assumes
    // Yahoo returns ten results per page.
    string url = string.Format(yahoo + "p={0}&b={1}", query, this.page * 10 + 1);

    // WebClient is IDisposable, so release it as soon as the download finishes.
    using (WebClient client = new WebClient()) {
        this.content = client.DownloadString(url);
    }
}
int i = x.IndexOf("</a></h3>");▼
if (i > 0) return x.Substring(0, i);▼
else return x; ▼
} ▼
/// <summary>
/// Search results extracted from the downloaded page content by matching
/// each result entry with a regular expression.
/// </summary>
public YahooResult[] Results {
get {
// NOTE(review): the helper lambdas below (substringBefore, and the
// substringAfter used further down) are truncated in this view, and the
// declaration of `results` is not visible either — restore them before compiling.
Func<string, string, string> substringBefore = (str, before) =>
{
Func<string,
{
};
// Strips markup: every "<...>" tag is replaced by the empty string.
Converter<string, string> getText = p =>
Regex.Replace(p, "<[^>]*>", x => "");
// Pattern for one result entry. IgnorePatternWhitespace lets the pattern be
// laid out over several lines; ExplicitCapture keeps only the named groups
// LinkAttributes, LinkText and Abstract.
Regex rx = new Regex(@"
<div \s class=""res"">
<div>
<h3>
<a \s (?'LinkAttributes'[^>]+)>
(?'LinkText' .*?)
(?></a>)
</h3>
</div>
<div \s class=""abstr"">
(?'Abstract' .*?)
(?></div>)
.*?
(></div>)
</li>",
RegexOptions.IgnorePatternWhitespace
| RegexOptions.ExplicitCapture
);
// Build one YahooResult per regex match in the page content.
foreach (Match e in rx.Matches(this.content)) {
// Presumably takes the text between href=" and the next double quote in the
// anchor's attributes — depends on the helper bodies not visible here.
string rurl = getText(substringBefore(substringAfter(
e.Groups["LinkAttributes"].Value, @"href="""), @""""));
string rtitle = getText(e.Groups["LinkText"].Value);
string rcontent = getText(e.Groups["Abstract"].Value);
results.Add(new YahooResult(rurl, rtitle, rcontent));
}
// NOTE(review): the return operand is missing in this view; presumably the
// collected results converted to an array — confirm.
return
}
}
Line 107 ⟶ 130:
this.Title = title;
this.Content = content;
▲ }
/// <summary>
/// Formats the result as three labelled lines (title, link, text),
/// preceded by a blank line.
/// </summary>
/// <returns>The formatted, multi-line description of this result.</returns>
public override string ToString() {
    return string.Format("\nTitle: {0}\nLink:  {1}\nText:  {2}",
        Title, URL, Content);
}
}
Line 114 ⟶ 143:
/// <summary>
/// Demo program: searches Yahoo for "test" and prints every result found.
/// </summary>
class Prog {
    static void Main() {
        // Fetch the first two result pages (indices 0 and 1); the page range
        // is a demo choice — adjust as needed.
        foreach (int page in new[] { 0, 1 })
        {
            YahooSearch x = new YahooSearch("test", page);
            foreach (YahooResult result in x.Results)
            {
                // Console.WriteLine calls result.ToString() for the output text.
                Console.WriteLine(result);
            }
        }
    }
}
=={{header|GUISS}}==
|