Yahoo! search interface: Difference between revisions
Content added Content deleted
(→{{header|C sharp}}: fixed code lines length and adjusted extraction.) |
|||
Line 38: | Line 38: | ||
=={{header|C sharp}}== |
=={{header|C sharp}}== |
||
Generally it is not a good idea to scrape web pages. |
|||
{{lines too long|C sharp}} |
|||
E.g., all implementations for this task that use a regex matching |
|||
"<a class=" now fail, because Yahoo has changed its output format. |
|||
<lang csharp>using System; |
<lang csharp>using System; |
||
using System.Net; |
using System.Net; |
||
using System.Text; |
|||
using System.Text.RegularExpressions; |
using System.Text.RegularExpressions; |
||
using System.Collections; |
|||
using System.Collections.Generic; |
using System.Collections.Generic; |
||
using System.Linq; |
|||
class YahooSearch { |
class YahooSearch { |
||
private string query; |
private string query; |
||
private string content; |
private string content; |
||
private int page |
private int page; |
||
const string yahoo = "http://search.yahoo.com/search?"; |
|||
⚫ | |||
⚫ | |||
⚫ | |||
this.content = new WebClient().DownloadString("http://search.yahoo.com/search?p=" + query); |
|||
⚫ | |||
public YahooSearch(string query, int page) { |
public YahooSearch(string query, int page) { |
||
this.query = query; |
this.query = query; |
||
this.page = page; |
this.page = page; |
||
this.content = new WebClient() |
this.content = new WebClient() |
||
( |
.DownloadString( |
||
string.Format(yahoo + "p={0}&b={1}", query, this.page * 10 + 1) |
|||
⚫ | |||
} |
} |
||
string Fix(string x) { |
|||
x = x.Replace("<b>", "").Replace("</b>", "").Replace("<wbr />", "").Replace("<wbr>", "").Replace("<b>...</b>", ""); |
|||
⚫ | |||
⚫ | |||
⚫ | |||
⚫ | |||
public YahooResult[] Results { |
public YahooResult[] Results { |
||
get { |
get { |
||
List<YahooResult> results = new List<YahooResult>(); |
|||
Func<string, string, string> substringBefore = (str, before) => |
|||
foreach (Match e in new Regex("<a class=\"yschttl spt\" href=\".+?\" >(.+?)</a>" + |
|||
{ |
|||
"</h3></div>(?(<div class=\"sm-bd sm-nophoto\" id=\"sm-bd-4-1\">.+?</div>))" + |
|||
int iHref = str.IndexOf(before); |
|||
return iHref < 0 ? "" : str.Substring(0, iHref); |
|||
}; |
|||
Func<string, string, string> substringAfter = (str, after) => |
|||
{ |
|||
⚫ | |||
⚫ | |||
}; |
|||
Converter<string, string> getText = p => |
|||
Regex.Replace(p, "<[^>]*>", x => ""); |
|||
Regex rx = new Regex(@" |
|||
⚫ | |||
<div \s class=""res""> |
|||
<div> |
|||
<h3> |
|||
<a \s (?'LinkAttributes'[^>]+)> |
|||
(?'LinkText' .*?) |
|||
(?></a>) |
|||
</h3> |
|||
</div> |
|||
<div \s class=""abstr""> |
|||
(?'Abstract' .*?) |
|||
(?></div>) |
|||
.*? |
|||
(></div>) |
|||
</li>", |
|||
RegexOptions.IgnorePatternWhitespace |
|||
| RegexOptions.ExplicitCapture |
|||
); |
|||
foreach (Match e in rx.Matches(this.content)) { |
|||
string rurl = getText(substringBefore(substringAfter( |
|||
e.Groups["LinkAttributes"].Value, @"href="""), @"""")); |
|||
string rtitle = getText(e.Groups["LinkText"].Value); |
|||
string rcontent = getText(e.Groups["Abstract"].Value); |
|||
results.Add(new YahooResult(rurl, rtitle, rcontent)); |
results.Add(new YahooResult(rurl, rtitle, rcontent)); |
||
} |
} |
||
return |
return results.ToArray(); |
||
} |
} |
||
} |
} |
||
Line 107: | Line 130: | ||
this.Title = title; |
this.Title = title; |
||
this.Content = content; |
this.Content = content; |
||
⚫ | |||
public override string ToString() |
|||
⚫ | |||
return string.Format("\nTitle: {0}\nLink: {1}\nText: {2}", |
|||
Title, URL, Content); |
|||
} |
} |
||
} |
} |
||
Line 114: | Line 143: | ||
class Prog { |
class Prog { |
||
static void Main() { |
static void Main() { |
||
foreach (int page in new[] { 0, 1 }) |
|||
{ |
|||
YahooSearch x = new YahooSearch("test", page); |
|||
foreach (YahooResult result in x.Results) |
foreach (YahooResult result in x.Results) |
||
{ |
|||
Console.WriteLine(result); |
|||
} |
|||
} |
} |
||
} |
} |
||
} |
|||
</lang> |
|||
=={{header|GUISS}}== |
=={{header|GUISS}}== |