Yahoo! search interface: Difference between revisions
(implement in nim lang) |
|||
Line 1,539:
Text : Last week Christian Drumm (@ceedee666) and Fred Verheul (@fredverheul) had a short conversation on ...</pre>
=={{header|Mathematica}}/{{header|Wolfram Language}}==
We cannot define a class in Mathematica, so I generate a "Manipulate" object instead.
<lang>Manipulate[
|
Revision as of 11:51, 5 September 2021
You are encouraged to solve this task according to the task description, using any language you may know.
Create a class for searching Yahoo! results.
It must implement a Next Page method, and read URL, Title and Content from results.
AArch64 Assembly
<lang AArch64 Assembly> /* ARM assembly AARCH64 Raspberry PI 3B */ /* program yahoosearch64.s */
/* access RosettaCode.org and data extract */ /* use openssl for access to port 443 */ /* test openssl : package libssl-dev */ /*******************************************/ /* Constantes file */ /*******************************************/ /* for this file see task include a file in language AArch64 assembly*/ .include "../includeConstantesARM64.inc"
.equ TAILLEBUFFER, 500
.equ SSL_OP_NO_SSLv3, 0x02000000 .equ SSL_OP_NO_COMPRESSION, 0x00020000 .equ SSL_MODE_AUTO_RETRY, 0x00000004 .equ SSL_CTRL_MODE, 33
.equ BIO_C_SET_CONNECT, 100 .equ BIO_C_DO_STATE_MACHINE, 101 .equ BIO_C_SET_SSL, 109 .equ BIO_C_GET_SSL, 110
.equ LGBUFFERREQ, 512001
/*********************************/
/* Initialised data              */
/*********************************/
.data
szMessDebutPgm:        .asciz "Début du programme. \n"
szRetourLigne:         .asciz "\n"
szMessFinOK:           .asciz "Fin normale du programme. \n"
szMessErreur:          .asciz "Erreur !!!"
szMessExtractArea:     .asciz "Extraction = "
szNomSite1:            .asciz "search.yahoo.com:443"      // host name and port
szLibStart:            .asciz ">Rosetta Code"             // key text looked up in the response
szNomrepCertif:        .asciz "/pi/certificats"           // passed as certificate directory
szRequete1:            .asciz "GET /search?p=\"Rosettacode.org\"&b=1 HTTP/1.1 \r\nHost: search.yahoo.com\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\n\r\n"
/*********************************/
/* Uninitialised data            */
/*********************************/
.bss
.align 4
sBufferreq:            .skip LGBUFFERREQ                  // receives the raw HTTP response
szExtractArea:         .skip TAILLEBUFFER                 // receives the extracted text
stNewSSL:              .skip 200                          // first 8 bytes receive the SSL pointer
/*********************************/
/* Code section                  */
/*********************************/
.text
.global main
/* Entry point: print the banner, run the query, analyse the answer, */
/* then leave through the EXIT system call with status 0 or 1.       */
main:
    ldr x0,qAdrszMessDebutPgm
    bl affichageMess                   // start banner
    bl envoiRequete                    // connect to port 443 and send the query
    cmn x0,#1                          // did envoiRequete return -1 ?
    beq 1f
    bl analyseReponse
    ldr x0,qAdrszMessFinOK
    bl affichageMess                   // normal end message
    mov x0,#0                          // exit status: success
    b 2f
1:                                     // request failed
    ldr x0,qAdrszMessErreur
    bl affichageMess
    mov x0,#1                          // exit status: failure
2:
    mov x8,EXIT                        // system call number comes from the constants include
    svc 0
qAdrszMessDebutPgm:    .quad szMessDebutPgm
qAdrszMessFinOK:       .quad szMessFinOK
qAdrszMessErreur:      .quad szMessErreur
/*********************************************************/
/*   Connect to the host on port 443 and send the query  */
/*********************************************************/
/* no input register                                                  */
/* x0 returns -1 when a set-up step or the write fails, otherwise 0   */
/*    once the read loop has ended (end marker seen, BIO_read <= 0,   */
/*    or receive buffer full)                                         */
/* x1-x5 and x20-x23 are preserved                                    */
/* register roles: x20 = SSL context, x21 = BIO (0 while none),       */
/*                 x22 = write cursor inside sBufferreq               */
envoiRequete:
    stp x1,lr,[sp,-16]!                // save registers
    stp x2,x3,[sp,-16]!
    stp x4,x5,[sp,-16]!
    stp x20,x21,[sp,-16]!              // x19-x28 are callee-saved under AAPCS64
    stp x22,x23,[sp,-16]!              // x23 only completes the pair (keeps sp 16-byte aligned)
    mov x21,#0                         // no BIO yet: the error path can free it unconditionally
    //*************************************
    // OpenSSL initialisation
    //*************************************
    mov x0,#0                          // give the first call defined arguments
    mov x1,#0                          // (opts = 0, settings = NULL)
    bl OPENSSL_init_crypto
    bl ERR_load_BIO_strings
    mov x2,#0
    mov x1,#0                          // settings = NULL
    mov x0,#2                          // presumably OPENSSL_INIT_LOAD_CRYPTO_STRINGS - confirm against openssl/crypto.h
    bl OPENSSL_init_crypto
    mov x2,#0
    mov x1,#0
    mov x0,#0
    bl OPENSSL_init_ssl
    cmp w0,#0                          // int result: test the 32-bit view, upper half of x0 is unspecified
    ble erreur
    bl TLS_client_method
    bl SSL_CTX_new
    cbz x0,erreur                      // NULL context
    mov x20,x0                         // keep context
    ldr x1,iFlag
    bl SSL_CTX_set_options
    mov x0,x20                         // context
    mov x1,#0                          // no CA file
    ldr x2,qAdrszNomrepCertif          // CA directory
    bl SSL_CTX_load_verify_locations
    cmp w0,#0                          // int result
    ble erreur
    mov x0,x20                         // context
    bl BIO_new_ssl_connect
    cbz x0,erreur                      // NULL BIO
    mov x21,x0                         // keep BIO
    mov x1,#BIO_C_GET_SSL
    mov x2,#0
    ldr x3,qAdrstNewSSL                // receives the SSL pointer
    bl BIO_ctrl
    ldr x0,qAdrstNewSSL
    ldr x0,[x0]
    cbz x0,erreur                      // no SSL object was stored
    mov x1,#SSL_CTRL_MODE
    mov x2,#SSL_MODE_AUTO_RETRY
    mov x3,#0
    bl SSL_ctrl
    mov x0,x21                         // bio
    mov x1,#BIO_C_SET_CONNECT
    mov x2,#0
    ldr x3,qAdrszNomSite1              // "host:port"
    bl BIO_ctrl
    mov x0,x21                         // bio
    mov x1,#BIO_C_DO_STATE_MACHINE     // performs the connection
    mov x2,#0
    mov x3,#0
    bl BIO_ctrl
    cmp x0,#0                          // long result: <= 0 means the connection failed
    ble erreur
    // compute query length
    ldr x1,qAdrszRequete1              // query
    mov x2,#0
1:
    ldrb w0,[x1,x2]
    cbz w0,11f
    add x2,x2,1
    b 1b
11:
    // send query: x1 = query address, x2 = query length
    mov x0,x21                         // bio
    bl BIO_write
    cmp w0,#0                          // int result: nothing written is treated as an error
    ble erreur
    ldr x22,qAdrsBufferreq             // write cursor = buffer start
2:                                     // read loop, one chunk per turn
    ldr x2,qAdrsBufferreq
    ldr x3,qLgBuffer
    add x2,x2,x3                       // address of the last byte, kept at zero as terminator
    subs x2,x2,x22                     // room left after what was already received
    ble 4f                             // buffer full: stop instead of overflowing
    mov x0,x21                         // bio
    mov x1,x22                         // where this chunk goes
    bl BIO_read
    cmp w0,#0                          // int result
    ble 4f                             // error or connection closed by the server
    add x22,x22,w0,sxtw                // advance the cursor by the byte count
    ldr x2,qAdrsBufferreq
    sub x2,x22,x2                      // total bytes received
    cmp x2,#8
    blt 3f                             // fewer than 8 bytes: nothing to compare yet
    sub x2,x22,#8
    ldr x2,[x2]                        // last 8 bytes received
    ldr x3,qCharEnd                    // bytes "\n\r\n0\r\n\r\n" in memory order
    cmp x2,x3                          // end of text ?
    beq 4f
3:
    mov x1,#0xFFFFFF                   // delay loop kept from the original
31:
    subs x1,x1,1
    bgt 31b
    b 2b                               // read the next chunk
4:                                     // end of reading
    // NOTE(review): the SSL context in x20 is never released; the program exits shortly afterwards
    mov x0,x21                         // close bio
    bl BIO_free_all
    mov x0,#0
    b 100f
erreur:                                // error display
    mov x22,x0                         // keep the failing value across the clean-up call
    mov x0,x21
    bl BIO_free_all                    // does nothing when no BIO was created (x21 = 0)
    mov x0,x22
    ldr x1,qAdrszMessErreur
    bl afficheErreur
    mov x0,#-1                         // error code
100:
    ldp x22,x23,[sp],16                // restore registers
    ldp x20,x21,[sp],16
    ldp x4,x5,[sp],16
    ldp x2,x3,[sp],16
    ldp x1,lr,[sp],16
    ret                                // return to address lr x30
qAdrszRequete1:        .quad szRequete1
qAdrsBufferreq:        .quad sBufferreq
iFlag:                 .quad SSL_OP_NO_SSLv3 | SSL_OP_NO_COMPRESSION
qAdrstNewSSL:          .quad stNewSSL
qAdrszNomSite1:        .quad szNomSite1
qAdrszNomrepCertif:    .quad szNomrepCertif
qCharEnd:              .quad 0x0A0D0A0D300A0D0A
qLgBuffer:             .quad LGBUFFERREQ - 1
/*********************************************************/
/*   Analyse the response held in sBufferreq             */
/*********************************************************/
/* no input register                                              */
/* x0 returns -1 when extChaine reported -1; otherwise x0 is      */
/*    whatever the last affichageMess call left in it             */
/* x1-x3 are preserved, x4 is overwritten                         */
analyseReponse:
    stp x1,lr,[sp,-16]!                // save registers
    stp x2,x3,[sp,-16]!
    ldr x4,qAdrszExtractArea           // where the extracted text is written
    mov x3,#-11                        // offset applied by extChaine
    mov x2,#2                          // use the 2nd occurrence of the key
    ldr x1,qAdrszLibStart              // key text
    ldr x0,qAdrsBufferreq              // response buffer
    bl extChaine
    cmn x0,#1                          // did extChaine return -1 ?
    beq 1f
    ldr x0,qAdrszMessExtractArea       // "Extraction = "
    bl affichageMess
    ldr x0,qAdrszExtractArea           // extracted text
    bl affichageMess
    ldr x0,qAdrszRetourLigne           // line feed
    bl affichageMess
    b 2f
1:                                     // key not found or extraction out of range
    ldr x0,qAdrszMessErreur
    bl affichageMess
    mov x0,#-1                         // error return code
2:
    ldp x2,x3,[sp],16                  // restore registers
    ldp x1,lr,[sp],16
    ret                                // return to address lr x30
qAdrszLibStart:        .quad szLibStart
qAdrszExtractArea:     .quad szExtractArea
qAdrszMessExtractArea: .quad szMessExtractArea
qAdrszRetourLigne:     .quad szRetourLigne
/*********************************************************/
/*   Extract the text located relative to a key text     */
/*********************************************************/
/* x0 buffer address (zero-terminated text)                          */
/* x1 key text to search (zero-terminated)                           */
/* x2 which occurrence of the key text to use                        */
/* x3 offset added to the match position                             */
/* x4 result area address                                            */
/* x0 returns -1 when the key is not found or the start address is   */
/*    at or past the end of the text, 0 when the text ended during   */
/*    the copy, otherwise the address just after the copied bytes    */
/* x1-x7 are preserved, x8 is overwritten                            */
extChaine:
    stp x1,lr,[sp,-16]!                // save registers
    stp x2,x3,[sp,-16]!
    stp x4,x5,[sp,-16]!
    stp x6,x7,[sp,-16]!
    mov x5,x0                          // x5 = current search start
    mov x6,x1                          // x6 = key text
    mov x8,x5
1:                                     // walk to the terminating zero of the text
    ldrb w0,[x8]
    cbz w0,2f
    add x8,x8,1
    b 1b
2:                                     // here x8 = address of the text's final zero
    mov x7,#0
3:                                     // x7 = length of the key text
    ldrb w0,[x6,x7]
    cbz w0,4f
    add x7,x7,1
    b 3b
4:                                     // look for the requested occurrence
    mov x0,x5
    mov x1,x6
    bl rechercheSousChaine
    tbnz x0,#63,100f                   // negative index: not found, hand it back unchanged
    subs x2,x2,1                       // one occurrence consumed
    ble 5f                             // that was the requested one
    add x5,x5,x0                       // restart just behind this occurrence
    add x5,x5,x7
    b 4b
5:
    add x0,x0,x5                       // address of the match
    add x3,x3,x0                       // + offset
    add x3,x3,x7                       // + key length
    sub x3,x3,1                        // - 1 : x3 = first byte to copy
    cmp x3,x8                          // at or beyond the end of the text ?
    bge 8f
    mov x0,0                           // x0 = copy index
6:                                     // copy one byte per turn
    ldrb w2,[x3,x0]
    strb w2,[x4,x0]
    cbz w2,9f                          // text ended: result is already terminated
    cmp x0,48                          // extraction length reached ?
    beq 7f
    add x0,x0,1
    b 6b
7:
    strb wzr,[x4,x0]                   // final zero replaces the byte stored at index 48
    add x0,x0,x3
    add x0,x0,1                        // address behind the extraction, usable for another search
    b 100f
8:
    mov x0,-1                          // error
    b 100f
9:
    mov x0,0
100:
    ldp x6,x7,[sp],16                  // restore registers
    ldp x4,x5,[sp],16
    ldp x2,x3,[sp],16
    ldp x1,lr,[sp],16
    ret                                // return to address lr x30
/******************************************************************/
/*   Search a substring in a string                               */
/******************************************************************/
/* x0 contains the address of the string (zero-terminated)        */
/* x1 contains the address of the substring (zero-terminated)     */
/* x0 returns the index of the first occurrence, or -1 when the   */
/*    substring is empty or does not occur                        */
/* x1-x7 are preserved                                            */
/* Every candidate start position is tried in turn, so a failed   */
/* partial match no longer skips positions (e.g. "aab" in "aaab"  */
/* is now found at index 1).                                      */
rechercheSousChaine:
    stp x1,lr,[sp,-16]!                // save registers
    stp x2,x3,[sp,-16]!
    stp x4,x5,[sp,-16]!
    stp x6,x7,[sp,-16]!
    ldrb w4,[x1]                       // first byte of the substring
    cbz w4,99f                         // empty substring: error
    mov x6,#0                          // x6 = candidate start index in the string
1:                                     // try to match at candidate x6
    add x2,x0,x6                       // x2 = address of the candidate
    mov x3,#0                          // x3 = offset inside the substring
2:
    ldrb w4,[x1,x3]                    // next substring byte
    cbz w4,3f                          // substring exhausted: full match
    ldrb w5,[x2,x3]                    // matching string byte
    cbz w5,99f                         // string ended first: no later candidate can fit either
    cmp w5,w4
    bne 4f
    add x3,x3,1
    b 2b
4:                                     // mismatch: move the candidate one byte forward
    add x6,x6,1
    b 1b
3:
    mov x0,x6                          // return start index of the substring in the string
    b 100f
99:
    mov x0,-1                          // not found
100:
    ldp x6,x7,[sp],16                  // restore registers
    ldp x4,x5,[sp],16
    ldp x2,x3,[sp],16
    ldp x1,lr,[sp],16
    ret                                // return to address lr x30
/********************************************************/ /* File Include fonctions */ /********************************************************/ /* for this file see task include a file in language AArch64 assembly */ .include "../includeARM64.inc" </lang>
- Output:
Début du programme. Extraction = Rosetta Code is a programming chrestomathy site. Fin normale du programme.
ARM Assembly
<lang ARM Assembly> /* ARM assembly Raspberry PI */ /* program yahoosearch.s */ /* access RosettaCode.org and data extract */ /* use openssl for access to port 443 */ /* test openssl : package libssl-dev */
/* REMARK 1 : this program uses routines from an include file;
see the task "Include a file" (ARM Assembly) for the routines affichageMess and conversion10, and see the include instruction at the end of this program */
/*******************************************/ /* Constantes */ /*******************************************/ .equ STDOUT, 1 @ Linux output console .equ EXIT, 1 @ Linux syscall .equ WRITE, 4 @ Linux syscall .equ BRK, 0x2d @ Linux syscall .equ CHARPOS, '@'
.equ EXIT, 1 .equ TAILLEBUFFER, 500
.equ SSL_OP_NO_SSLv3, 0x02000000 .equ SSL_OP_NO_COMPRESSION, 0x00020000 .equ SSL_MODE_AUTO_RETRY, 0x00000004 .equ SSL_CTRL_MODE, 33
.equ BIO_C_SET_CONNECT, 100 .equ BIO_C_DO_STATE_MACHINE, 101 .equ BIO_C_SET_SSL, 109 .equ BIO_C_GET_SSL, 110
.equ LGBUFFERREQ, 512001 .equ LGBUFFER2, 128001
/*********************************/
/* Initialised data              */
/*********************************/
.data
szMessDebutPgm:        .asciz "Début du programme. \n"
szRetourLigne:         .asciz "\n"
szMessFinOK:           .asciz "Fin normale du programme. \n"
szMessErreur:          .asciz "Erreur !!!"
szMessExtractArea:     .asciz "Extraction = "
szNomSite1:            .asciz "search.yahoo.com:443"      @ host name and port
szLibStart:            .asciz ">Rosetta Code"             @ key text looked up in the response
szNomrepCertif:        .asciz "/pi/certificats"           @ passed as certificate directory
szRequete1:            .asciz "GET /search?p=\"Rosettacode.org\"&b=1 HTTP/1.1 \r\nHost: search.yahoo.com\r\nConnection: keep-alive\r\nContent-Type: text/plain\r\n\r\n"
/*********************************/
/* Uninitialised data            */
/*********************************/
.bss
.align 4
sBufferreq:            .skip LGBUFFERREQ                  @ receives the raw HTTP response
szExtractArea:         .skip TAILLEBUFFER                 @ receives the extracted text
stNewSSL:              .skip 200                          @ first word receives the SSL pointer
/*********************************/
/* Code section                  */
/*********************************/
.text
.global main
/* Entry point: print the banner, run the query, analyse the answer, */
/* then leave through the EXIT system call with status 0 or 1.       */
main:
    ldr r0,iAdrszMessDebutPgm
    bl affichageMess                   @ start banner
    bl envoiRequete                    @ connect to port 443 and send the query
    cmn r0,#1                          @ did envoiRequete return -1 ?
    beq 1f
    bl analyseReponse
    ldr r0,iAdrszMessFinOK
    bl affichageMess                   @ normal end message
    mov r0,#0                          @ exit status: success
    b 2f
1:                                     @ request failed
    ldr r0,iAdrszMessErreur
    bl affichageMess
    mov r0,#1                          @ exit status: failure
2:
    mov r7,#EXIT                       @ system call number
    svc #0
iAdrszMessDebutPgm:    .int szMessDebutPgm
iAdrszMessFinOK:       .int szMessFinOK
iAdrszMessErreur:      .int szMessErreur
/*********************************************************/
/*   Connect to the host on port 443 and send the query  */
/*********************************************************/
/* no input register                                                  */
/* r0 returns -1 when a set-up step or the write fails, otherwise 0   */
/*    once the read loop has ended (end marker seen, BIO_read <= 0,   */
/*    or receive buffer full)                                         */
/* r2-r8 are preserved                                                */
/* register roles: r6 = SSL context, r5 = BIO (0 while none),         */
/*                 r7 = write cursor inside sBufferreq                */
envoiRequete:
    push {r2-r8,lr}                    @ save registers
    mov r5,#0                          @ no BIO yet: the error path can free it unconditionally
    @*************************************
    @ OpenSSL initialisation
    @*************************************
    mov r0,#0                          @ give the first call defined arguments,
    mov r1,#0                          @ same register layout as the call below
    mov r2,#0
    bl OPENSSL_init_crypto
    bl ERR_load_BIO_strings
    mov r2,#0
    mov r1,#0
    mov r0,#2                          @ presumably OPENSSL_INIT_LOAD_CRYPTO_STRINGS - confirm against openssl/crypto.h
    bl OPENSSL_init_crypto
    mov r2,#0
    mov r1,#0
    mov r0,#0
    bl OPENSSL_init_ssl
    cmp r0,#0
    ble erreur
    bl TLS_client_method
    bl SSL_CTX_new
    cmp r0,#0
    beq erreur                         @ pointer result: only NULL is a failure
    mov r6,r0                          @ keep context
    ldr r1,iFlag
    bl SSL_CTX_set_options             @ NOTE(review): assumes a 32-bit options argument - confirm for the OpenSSL version in use
    mov r0,r6
    mov r1,#0                          @ no CA file
    ldr r2,iAdrszNomrepCertif          @ CA directory
    bl SSL_CTX_load_verify_locations
    cmp r0,#0
    ble erreur
    mov r0,r6
    bl BIO_new_ssl_connect
    cmp r0,#0
    beq erreur                         @ pointer result: only NULL is a failure
    mov r5,r0                          @ keep BIO
    mov r1,#BIO_C_GET_SSL
    mov r2,#0
    ldr r3,iAdrstNewSSL                @ receives the SSL pointer
    bl BIO_ctrl
    ldr r0,iAdrstNewSSL
    ldr r0,[r0]
    cmp r0,#0
    beq erreur                         @ no SSL object was stored
    mov r1,#SSL_CTRL_MODE
    mov r2,#SSL_MODE_AUTO_RETRY
    mov r3,#0
    bl SSL_ctrl
    mov r0,r5                          @ bio
    mov r1,#BIO_C_SET_CONNECT
    mov r2,#0
    ldr r3,iAdrszNomSite1              @ "host:port"
    bl BIO_ctrl
    mov r0,r5                          @ bio
    mov r1,#BIO_C_DO_STATE_MACHINE     @ performs the connection
    mov r2,#0
    mov r3,#0
    bl BIO_ctrl
    cmp r0,#0                          @ <= 0 means the connection failed
    ble erreur
    @ compute query length
    ldr r1,iAdrszRequete1              @ query
    mov r2,#0
1:
    ldrb r0,[r1,r2]
    cmp r0,#0
    addne r2,#1
    bne 1b
    @ send query: r1 = query address, r2 = query length
    mov r0,r5                          @ bio
    bl BIO_write
    cmp r0,#0                          @ nothing written is treated as an error
    ble erreur
    ldr r7,iAdrsBufferreq              @ write cursor = buffer start
2:                                     @ read loop, one chunk per turn
    ldr r2,iAdrsBufferreq
    add r2,#LGBUFFERREQ - 1            @ address of the last byte, kept at zero as terminator
    subs r2,r7                         @ room left after what was already received
    ble 4f                             @ buffer full: stop instead of overflowing
    mov r0,r5                          @ bio
    mov r1,r7                          @ where this chunk goes
    bl BIO_read
    cmp r0,#0
    ble 4f                             @ error or connection closed by the server
    add r7,r0                          @ advance the cursor by the byte count
    ldr r3,iAdrsBufferreq
    sub r3,r7,r3                       @ total bytes received
    cmp r3,#6
    blt 3f                             @ fewer than 6 bytes: nothing to compare yet
    sub r2,r7,#6
    ldr r2,[r2]                        @ 4 bytes located 6 bytes before the cursor
    ldr r3,iCharEnd                    @ bytes "\n0\r\n" in memory order
    cmp r2,r3                          @ end of text ?
    beq 4f
3:
    mov r1,#0xFFFFFF                   @ delay loop kept from the original
31:
    subs r1,#1
    bgt 31b
    b 2b                               @ read the next chunk
4:                                     @ end of reading
    @ NOTE(review): the SSL context in r6 is never released; the program exits shortly afterwards
    mov r0,r5                          @ close bio
    bl BIO_free_all
    mov r0,#0
    b 100f
erreur:                                @ error display
    mov r7,r0                          @ keep the failing value across the clean-up call
    mov r0,r5
    bl BIO_free_all                    @ does nothing when no BIO was created (r5 = 0)
    mov r0,r7
    ldr r1,iAdrszMessErreur
    bl afficheerreur
    mov r0,#-1                         @ error code
100:
    pop {r2-r8,lr}                     @ restore registers
    bx lr
iAdrszRequete1:        .int szRequete1
iAdrsBufferreq:        .int sBufferreq
iFlag:                 .int SSL_OP_NO_SSLv3 | SSL_OP_NO_COMPRESSION
iAdrstNewSSL:          .int stNewSSL
iAdrszNomSite1:        .int szNomSite1
iAdrszNomrepCertif:    .int szNomrepCertif
iCharEnd:              .int 0x0A0D300A
/*********************************************************/
/*   Analyse the response held in sBufferreq             */
/*********************************************************/
/* no input register                                              */
/* r0 returns -1 when extChaine reported -1; otherwise r0 is      */
/*    whatever the last affichageMess call left in it             */
/* r1-r4 are preserved                                            */
analyseReponse:
    push {r1-r4,lr}                    @ save registers
    ldr r4,iAdrszExtractArea           @ where the extracted text is written
    mov r3,#-11                        @ offset applied by extChaine
    mov r2,#2                          @ use the 2nd occurrence of the key
    ldr r1,iAdrszLibStart              @ key text
    ldr r0,iAdrsBufferreq              @ response buffer
    bl extChaine
    cmn r0,#1                          @ did extChaine return -1 ?
    beq 1f
    ldr r0,iAdrszMessExtractArea       @ "Extraction = "
    bl affichageMess
    ldr r0,iAdrszExtractArea           @ extracted text
    bl affichageMess
    ldr r0,iAdrszRetourLigne           @ line feed
    bl affichageMess
    b 2f
1:                                     @ key not found or extraction out of range
    ldr r0,iAdrszMessErreur
    bl affichageMess
    mov r0,#-1                         @ error return code
2:
    pop {r1-r4,lr}                     @ restore registers
    bx lr
iAdrszLibStart:        .int szLibStart
iAdrszExtractArea:     .int szExtractArea
iAdrszMessExtractArea: .int szMessExtractArea
iAdrszRetourLigne:     .int szRetourLigne
/*********************************************************/
/*   Extract the text located relative to a key text     */
/*********************************************************/
/* r0 buffer address (zero-terminated text)                          */
/* r1 key text to search (zero-terminated)                           */
/* r2 which occurrence of the key text to use                        */
/* r3 offset added to the match position                             */
/* r4 result area address                                            */
/* r0 returns -1 when the key is not found or the start address is   */
/*    at or past the end of the text, 0 when the text ended during   */
/*    the copy, otherwise the address just after the copied bytes    */
/* r2-r8 are preserved                                               */
extChaine:
    push {r2-r8,lr}                    @ save registers
    mov r5,r0                          @ r5 = current search start
    mov r6,r1                          @ r6 = key text
    mov r7,r5
1:                                     @ walk to the terminating zero of the text
    ldrb r0,[r7]
    cmp r0,#0
    beq 2f
    add r7,#1
    b 1b
2:                                     @ here r7 = address of the text's final zero
    mov r8,#0
3:                                     @ r8 = length of the key text
    ldrb r0,[r6,r8]
    cmp r0,#0
    beq 4f
    add r8,#1
    b 3b
4:                                     @ look for the requested occurrence
    mov r0,r5
    mov r1,r6
    bl rechercheSousChaine
    cmp r0,#0
    blt 100f                           @ not found: hand the negative index back unchanged
    subs r2,#1                         @ one occurrence consumed
    ble 5f                             @ that was the requested one
    add r5,r0                          @ restart just behind this occurrence
    add r5,r8
    b 4b
5:
    add r0,r5                          @ address of the match
    add r3,r0                          @ + offset
    add r3,r8                          @ + key length
    sub r3,#1                          @ - 1 : r3 = first byte to copy
    cmp r3,r7                          @ at or beyond the end of the text ?
    bge 8f
    mov r0,#0                          @ r0 = copy index
6:                                     @ copy one byte per turn
    ldrb r2,[r3,r0]
    strb r2,[r4,r0]
    cmp r2,#0                          @ text ended: result is already terminated
    beq 9f
    cmp r0,#48                         @ extraction length reached ?
    beq 7f
    add r0,#1
    b 6b
7:
    mov r2,#0
    strb r2,[r4,r0]                    @ final zero replaces the byte stored at index 48
    add r0,r3
    add r0,#1                          @ address behind the extraction, usable for another search
    b 100f
8:
    mov r0,#-1                         @ error
    b 100f
9:
    mov r0,#0
100:
    pop {r2-r8,lr}                     @ restore registers
    bx lr
/******************************************************************/
/*   Search a substring in a string                               */
/******************************************************************/
/* r0 contains the address of the string (zero-terminated)        */
/* r1 contains the address of the substring (zero-terminated)     */
/* r0 returns the index of the first occurrence, or -1 when the   */
/*    substring is empty or does not occur                        */
/* r1-r6 are preserved                                            */
/* Every candidate start position is tried in turn, so a failed   */
/* partial match no longer skips positions (e.g. "aab" in "aaab"  */
/* is now found at index 1).                                      */
rechercheSousChaine:
    push {r1-r6,lr}                    @ save registers
    ldrb r4,[r1]                       @ first byte of the substring
    cmp r4,#0                          @ empty substring ?
    moveq r0,#-1                       @ error
    beq 100f
    mov r6,#0                          @ r6 = candidate start index in the string
1:                                     @ try to match at candidate r6
    add r2,r0,r6                       @ r2 = address of the candidate
    mov r3,#0                          @ r3 = offset inside the substring
2:
    ldrb r4,[r1,r3]                    @ next substring byte
    cmp r4,#0
    beq 3f                             @ substring exhausted: full match
    ldrb r5,[r2,r3]                    @ matching string byte
    cmp r5,#0                          @ string ended first: no later candidate can fit either
    moveq r0,#-1
    beq 100f
    cmp r5,r4
    addeq r3,#1                        @ bytes agree: compare the next pair
    beq 2b
    add r6,#1                          @ mismatch: move the candidate one byte forward
    b 1b
3:
    mov r0,r6                          @ return start index of the substring in the string
100:
    pop {r1-r6,lr}                     @ restore registers
    bx lr
/***************************************************/ /* ROUTINES INCLUDE */ /***************************************************/ .include "../affichage.inc" </lang>
- Output:
Début du programme. Extraction = Rosetta Code is a programming chrestomathy site. Fin normale du programme.
AutoHotkey
translated from python example <lang AutoHotkey>test: yahooSearch("test", 1) yahooSearch("test", 2) return
yahooSearch(query, page) {
global start := ((page - 1) * 10) + 1 filedelete, search.txt urldownloadtofile, % "http://search.yahoo.com/search?p=" . query . "&b=" . start, search.txt fileread, content, search.txt
reg = <a class="yschttl spt" href=".+?" >(.+?)</a>
(.+?)
index := found := 1 while (found := regexmatch(content, reg, self, found + 1)) { msgbox % title%A_Index% := fix(self1) content%A_Index% := fix(self2) url%A_Index% := fix(self3) }
}
fix(url) {
if pos := instr(url, "</a>")
StringLeft, url, url, pos - 1 url := regexreplace(url, "<.*?>") return url }</lang>
C#
Generally it is not a good idea to scrape web pages. For example, every implementation of this task that uses a regex matching "<a class=" now fails, because Yahoo has since changed its output format. <lang csharp>using System; using System.Net; using System.Text.RegularExpressions; using System.Collections.Generic;
class YahooSearch {
private string query; private string content; private int page;
const string yahoo = "http://search.yahoo.com/search?";
public YahooSearch(string query) : this(query, 0) { }
public YahooSearch(string query, int page) { this.query = query; this.page = page; this.content = new WebClient() .DownloadString( string.Format(yahoo + "p={0}&b={1}", query, this.page * 10 + 1) ); }
public YahooResult[] Results { get { List<YahooResult> results = new List<YahooResult>();
Func<string, string, string> substringBefore = (str, before) => { int iHref = str.IndexOf(before); return iHref < 0 ? "" : str.Substring(0, iHref); }; Func<string, string, string> substringAfter = (str, after) => { int iHref = str.IndexOf(after); return iHref < 0 ? "" : str.Substring(iHref + after.Length); }; Converter<string, string> getText = p => Regex.Replace(p, "<[^>]*>", x => "");
Regex rx = new Regex(@"
<a \s (?'LinkAttributes'[^>]+)> (?'LinkText' .*?) (?></a>)
(?'Abstract' .*?)(?>
.*?(?>
", RegexOptions.IgnorePatternWhitespace | RegexOptions.ExplicitCapture ); foreach (Match e in rx.Matches(this.content)) { string rurl = getText(substringBefore(substringAfter( e.Groups["LinkAttributes"].Value, @"href="""), @"""")); string rtitle = getText(e.Groups["LinkText"].Value); string rcontent = getText(e.Groups["Abstract"].Value); results.Add(new YahooResult(rurl, rtitle, rcontent)); } return results.ToArray(); } } public YahooSearch NextPage() { return new YahooSearch(this.query, this.page + 1); } public YahooSearch GetPage(int page) { return new YahooSearch(this.query, page); } } class YahooResult { public string URL { get; set; } public string Title { get; set; } public string Content { get; set; } public YahooResult(string url, string title, string content) { this.URL = url; this.Title = title; this.Content = content; } public override string ToString() { return string.Format("\nTitle: {0}\nLink: {1}\nText: {2}", Title, URL, Content); } } // Usage: class Prog { static void Main() { foreach (int page in new[] { 0, 1 }) { YahooSearch x = new YahooSearch("test", page); foreach (YahooResult result in x.Results) { Console.WriteLine(result); } } } } </lang>
D
<lang d>import std.stdio, std.exception, std.regex, std.algorithm, std.string,
std.net.curl;
struct YahooResult {
string url, title, content;
string toString() const { return "\nTitle: %s\nLink: %s\nText: %s" .format(title, url, content); }
}
struct YahooSearch {
private string query, content; private uint page;
this(in string query_, in uint page_ = 0) { this.query = query_; this.page = page_; this.content = "http://search.yahoo.com/search?p=%s&b=%d" .format(query, page * 10 + 1).get.assumeUnique; }
@property results() const {
immutable re = `
<a \s (?P<linkAttributes> [^>]+)> (?P<linkText> .*?) </a>
(?P<abstract> .*?)
.*?
`; const clean = (string s) => s.replace("<[^>]*>".regex("g"),""); return content.match(re.regex("gx")).map!(m => YahooResult( clean(m.captures["linkAttributes"] .findSplitAfter(`href="`)[1] .findSplitBefore(`"`)[0]), clean(m.captures["linkText"]), clean(m.captures["abstract"]) )); } YahooSearch nextPage() const { return YahooSearch(query, page + 1); } } void main() { writefln("%(%s\n%)", "test".YahooSearch.results); }</lang>
- Output (shortened):
Title: Test.com Link: http://www.test.com/ Text: Test.com provides a complete software solution for creating online tests and managing enterprise and specialist certification programs, in up to 22 languages. Title: Speakeasy Speed Test Link: http://www.speakeasy.net/speedtest/ Text: Test your Internet Connection with Speakeasy's reliable and accurate broadband speed test. What's your speed? Title: Test | Define Test at Dictionary.com Link: http://dictionary.reference.com/browse/test Text: noun 1. the means by which the presence, quality, or genuineness of anything is determined; a means of trial. 2. the trial of the quality of something: to put to the ...
Gambas
<lang gambas>Public Sub Form_Open() Dim hWebView As WebView
Me.Arrangement = Arrange.Fill Me.Maximized = True Me.Title = "Yahoo! search interface"
hWebView = New WebView(Me) hWebView.Expand = True hWebView.URL = "https://www.yahoo.com"
End</lang> Click here to see output (I have typed 'rosettacode' in the search box)
Go
Yahoo! has evidently changed its search output format over the years and, if it is currently documented anywhere, then I couldn't find it.
The regular expression used below was figured out by studying the raw HTML and works fine as of 18 November 2019. <lang go>package main
import (
"fmt" "golang.org/x/net/html" "io/ioutil" "net/http" "regexp" "strings"
)
var (
expr = `
<a class=.*?href="(.*?)".*?>(.*?)</a>
` + `.*?
(.*?)
`
rx = regexp.MustCompile(expr)
)
// YahooResult holds the three pieces of text read from one search hit.
type YahooResult struct {
	title   string
	url     string
	content string
}
// String renders a result as three labelled lines: title, URL and content.
func (yr YahooResult) String() string {
	const layout = "Title : %s\nUrl : %s\nContent: %s\n"
	return fmt.Sprintf(layout, yr.title, yr.url, yr.content)
}
// YahooSearch identifies one page of results for a query.
// page is zero-based; results() turns it into the "b" URL parameter.
type YahooSearch struct {
	query string
	page  int
}
func (ys YahooSearch) results() []YahooResult {
search := fmt.Sprintf("http://search.yahoo.com/search?p=%s&b=%d", ys.query, ys.page*10+1) resp, _ := http.Get(search) body, _ := ioutil.ReadAll(resp.Body) s := string(body) defer resp.Body.Close() var results []YahooResult for _, f := range rx.FindAllStringSubmatch(s, -1) { yr := YahooResult{} yr.title = html.UnescapeString(strings.ReplaceAll(strings.ReplaceAll(f[2], "", ""), "", "")) yr.url = f[1] yr.content = html.UnescapeString(strings.ReplaceAll(strings.ReplaceAll(f[3], "", ""), "", "")) results = append(results, yr) } return results
}
// nextPage returns a search for the same query, one page further on.
// The receiver is a copy, so the caller's value is left untouched.
func (ys YahooSearch) nextPage() YahooSearch {
	ys.page++
	return ys
}
// main prints at most the first five results of pages 1 and 2 for the query
// "rosettacode".
func main() {
	const shown = 5 // limit output to the first few entries of each page
	// firstN caps a result list at shown entries; slicing [0:5] directly
	// panics whenever a page yields fewer results (or the download failed).
	firstN := func(rs []YahooResult) []YahooResult {
		if len(rs) > shown {
			return rs[:shown]
		}
		return rs
	}
	ys := YahooSearch{"rosettacode", 0}
	fmt.Println("PAGE 1 =>\n")
	for _, res := range firstN(ys.results()) {
		fmt.Println(res)
	}
	fmt.Println("PAGE 2 =>\n")
	for _, res := range firstN(ys.nextPage().results()) {
		fmt.Println(res)
	}
}</lang>
- Output:
Note there is some repetition between the pages.
PAGE 1 => Title : Rosetta Code Url : https://rosettacode.org/wiki/Rosetta_Code Content: Rosetta Code Rosetta Code is a programming chrestomathy site. Rosetta Code currently has 976 tasks, 231 draft tasks, and is aware of 756 languages, though we do not (and cannot) have solutions to every task in every language. 1 Places to start Title : Rosetta Code - Wikipedia Url : https://en.wikipedia.org/wiki/Rosetta_Code Content: Rosetta Code is a wiki -based programming chrestomathy website with implementations of common algorithms and solutions to various programming problems in many different programming languages. 1 Website 1.1 Data and structure 1.2 Languages Title : Rosetta Code (@rosettacode) | Twitter Url : https://twitter.com/rosettacode Content: The latest Tweets from Rosetta Code (@rosettacode). Twitter account for http://t.co/DuRZFWDfRn. The general idea here is for short announcements and the like. The ... Title : Best of Rosettacode Url : https://examples.p6c.dev/categories/best-of-rosettacode.html Content: 99 Problems Rosettacode Cookbook Euler Games Interpreters Modules Other Grammars Perlmonks Rosalind Shootout ... Title : Rosetta Code Blog Url : https://blog.rosettacode.org/ Content: (If you point 'rosettacode.com' to RosettaCode.org's IP address, you should still be able to see it) Second, I don't care if you want to use the name 'rosettacode' or 'rosetta code' in similar pursuits. I love that people have been calling task pages that have cropped up on various forums around the web as "rosetta code problems." That speaks ... PAGE 2 => Title : Rosetta Code | R-bloggers Url : https://www.r-bloggers.com/rosetta-code/ Content: Rosetta Code is a programming chrestomathy site. The idea is to present solutions to the same task in as many different languages as possible, to demonstrate how languages are similar and different, and to aid a person with a grounding in one approach to a problem in learning another. 
Title : Best of Rosettacode Url : https://examples.p6c.dev/categories/best-of-rosettacode.html Content: 99 Problems Rosettacode Cookbook Euler Games Interpreters Modules Other Grammars Perlmonks Rosalind Shootout ... Title : Rosetta Code Blog Url : https://blog.rosettacode.org/ Content: (If you point 'rosettacode.com' to RosettaCode.org's IP address, you should still be able to see it) Second, I don't care if you want to use the name 'rosettacode' or 'rosetta code' in similar pursuits. I love that people have been calling task pages that have cropped up on various forums around the web as "rosetta code problems." That speaks ... Title : What exactly is the purpose of Rosetta Code? - Quora Url : https://www.quora.com/What-exactly-is-the-purpose-of-Rosetta-Code Content: The name is a play on the Rosetta Stone. The Rosetta Stone featured a decree by King Ptolomy written in three scripts - Egyption hieroglyphs, Demotic, and Ancient Greek. Title : One R Tip A Day: Rosetta Code Url : https://onertipaday.blogspot.com/2009/07/rosetta-code.html Content: Today I'd like to suggest the interesting Rosetta Code site: Rosetta Code is a programming chrestomathy site. The idea is to present solutions to the same task in as many different languages as possible, to demonstrate how languages are similar and different, and to aid a person with a grounding in one approach to a problem in learning another.
GUISS
<lang guiss>Start,Programs,Applications,Mozilla Firefox,Inputbox:address bar>www.yahoo.co.uk, Button:Go,Area:browser window,Inputbox:searchbox>elephants,Button:Search</lang>
Haskell
Haskell is not an object oriented language, so this example does not implement an object class. However, it can be interesting as an example of how HTML source code can be parsed using the Parsec library. <lang Haskell>import Network.HTTP import Text.Parsec
data YahooSearchItem = YahooSearchItem {
itemUrl, itemTitle, itemContent :: String }
data YahooSearch = YahooSearch {
searchQuery :: String, searchPage :: Int, searchItems :: [YahooSearchItem] }
-- URL for Yahoo! searches, without giving a page number yahooUrl = "http://search.yahoo.com/search?p="
-- make an HTTP request and return a YahooSearch yahoo :: String -> IO YahooSearch yahoo q = simpleHTTP (getRequest $ yahooUrl ++ q) >>=
getResponseBody >>= return . YahooSearch q 1 . items
-- get some results and return the next page of results next :: YahooSearch -> IO YahooSearch next (YahooSearch q p _) =
simpleHTTP (getRequest $ -- add the page number to the search yahooUrl ++ q ++ "&b=" ++ show (p + 1)) >>= getResponseBody >>= return . YahooSearch q (p + 1) . items
printResults :: YahooSearch -> IO () printResults (YahooSearch q p items) = do
putStrLn $ "Showing Yahoo! search results for query: " ++ q putStrLn $ "Page: " ++ show p putChar '\n' mapM_ printOne items where printOne (YahooSearchItem itemUrl itemTitle itemContent) = do putStrLn $ "URL : " ++ itemUrl putStrLn $ "Title : " ++ itemTitle putStrLn $ "Abstr : " ++ itemContent putChar '\n'
urlTag, titleTag, contentTag1, contentTag2, ignoreTag,
ignoreText :: Parsec String () String
-- parse a tag containing the URL of a search result urlTag = do { string "<a id=\"link-";
many digit; string "\" class=\"yschttl spt\" href=\""; url <- manyTill anyChar (char '"'); manyTill anyChar (char '>'); return url }
-- the title comes after the URL tag, so parse it first, discard it -- and get the title text titleTag = do { urlTag; manyTill anyChar (try (string "</a>")) }
-- parse a tag containing the description of the search result -- the tag can be named "sm-abs" or "abstr"
contentTag1 = do { string "
")) } contentTag2 = do { string "
")) }
-- parse a tag and discard it ignoreTag = do { char ('<'); manyTill anyChar (char '>');
return "" }
-- parse some text and discard it ignoreText = do { many1 (noneOf "<"); return "" }
-- return only non-empty strings nonempty :: [String] -> Parsec String () [String] nonempty xs = return [ x | x <- xs, not (null x) ]
-- a template to parse a whole source file looking for items of the -- same class parseCategory x = do
res <- many x eof nonempty res
urls, titles, contents :: Parsec String () [String]
-- parse HTML source looking for URL tags of the search results urls = parseCategory url where
url = (try urlTag) <|> ignoreTag <|> ignoreText
-- parse HTML source looking for titles of the search results titles = parseCategory title where
title = (try titleTag) <|> ignoreTag <|> ignoreText
-- parse HTML source looking for descriptions of the search results contents = parseCategory content where
content = (try contentTag1) <|> (try contentTag2) <|> ignoreTag <|> ignoreText
-- parse the HTML source three times looking for URL, title and -- description of all search results and return them as a list of -- YahooSearchItem items :: String -> [YahooSearchItem] items q =
let ignoreOrKeep = either (const []) id us = ignoreOrKeep $ parse urls "" q ts = ignoreOrKeep $ parse titles "" q cs = ignoreOrKeep $ parse contents "" q in [ YahooSearchItem { itemUrl = u, itemTitle = t, itemContent = c } | (u, t, c) <- zip3 us ts cs ]
</lang>
Simple invocation from GHCi:
yahoo "Rosetta%20code" >>= printResults
Note that spaces in the query must be written as "%20", because literal spaces are not allowed in URLs.
Icon and Unicon
The following uses the Unicon pre-processor and messaging extensions and won't run under Icon without significant modification. The code provides a suitable demonstration; however, it could be made more robust, for example by URL-escaping the search string. <lang Icon>link printf,strings
procedure main() YS := YahooSearch("rosettacode") every 1 to 2 do { # 2 pages
YS.readnext() YS.showinfo() }
end
class YahooSearch(urlpat,page,response) #: class for Yahoo Search
method readnext() #: read the next page of search results self.page +:= 1 # can't find as w|w/o self readurl() end method readurl() #: read the url url := sprintf(self.urlpat,(self.page-1)*10+1) m := open(url,"m") | stop("Unable to open : ",url) every (self.response := "") ||:= |read(m) close(m) self.response := deletec(self.response,"\x00") # kill stray NULs end
method showinfo() #: show the info of interest self.response ? repeat { (tab(find("<")) & ="<a class=\"yschttl spt\" href=\"") | break url := tab(find("\"")) & tab(find(">")+1)
title := tab(find("<")) & ="</a>" tab(find("<")) & =("
printf("\nTitle : %i\n",title) printf("URL : %i\n",url) printf("Abstr : %i\n",abstr) } end
initially(searchtext) #: initialize each instance
urlpat := sprintf("http://search.yahoo.com/search?p=%s&b=%%d",searchtext) page := 0
end</lang>
printf.icn provides formatting; strings.icn provides deletec.
Sample Output (truncated):Title : "<b>Rosetta Code</b> - <b>Rosetta Code</b>" URL : "http://rosettacode.org/" Abstr : "<b>Rosetta Code</b> is a programming chrestomathy site. The idea is to present solutions to the same task in as many different languages as possible, t o demonstrate how ..." Title : "<b>Rosetta Code</b> - Wikipedia, the free <wbr />encyclopedia" URL : "http://en.wikipedia.org/wiki/Rosetta_Code" Abstr : " <b>Rosetta Code</b> is a wiki -based programming chrestomathy website with solutions to various programming problems in many different programming lan guages. It was created ..." Title : "Category:AutoHotkey - <b>Rosetta Code</b>" URL : "http://rosettacode.org/wiki/Category:AutoHotkey" Abstr : "Listed below are all of the tasks on <b>Rosetta Code</b> which have bee n solved using AutoHotkey." ... Title : "RosettaCON2011 Tutorials Collection | <wbr />RosettaCommons" URL : "http://www.rosettacommons.org/" Abstr : "Foldit in the news. Cooper et al. 2010 Predicting protein structures wi th a multiplayer online game, Nature 466 , 756 see also video. Rosetta-3.3 is no w available!" Title : "CALL: call a SUBROUTINE - HicEst: <wbr />Windows IDE programming ..." URL : "http://www.hicest.com/CALL.htm" Abstr : "\xe2\x87\x92 Example of a CALL call in "Roman_numerals" (<b>R osettaCode</b>) CALL transfers control to the first statement of a SUBROUTINE. C ALL subroutine_name[argument1, ..."
Java
<lang java>import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern;
class YahooSearch {
private String query; // Page number private int page = 1; // Regexp to look for the individual results in the returned page private static final Pattern pattern = Pattern.compile("<a class=\"yschttl spt\" href=\"[^*]+?\\*\\*([^\"]+?)\">(.+?)</a>.*?
public YahooSearch(String query) { this.query = query; }
public List<YahooResult> search() throws MalformedURLException, URISyntaxException, IOException { // Build the search string, starting with the Yahoo search URL, // then appending the query and optionally the page number (if > 1) StringBuilder searchUrl = new StringBuilder("http://search.yahoo.com/search?"); searchUrl.append("p=").append(URLEncoder.encode(query, "UTF-8")); if (page > 1) {searchUrl.append("&b=").append((page - 1) * 10 + 1);} // Query the Yahoo search engine URL url = new URL(searchUrl.toString()); List<YahooResult> result = new ArrayList<YahooResult>(); StringBuilder sb = new StringBuilder(); // Get the search results using a buffered reader BufferedReader in = null; try { in = new BufferedReader(new InputStreamReader(url.openStream())); // Read the results line by line String line = in.readLine(); while (line != null) { sb.append(line); line = in.readLine(); } } catch (IOException ioe) { ioe.printStackTrace(); } finally { try {in.close();} catch (Exception ignoreMe) {} } String searchResult = sb.toString(); // Look for the individual results by matching the regexp pattern Matcher matcher = pattern.matcher(searchResult); while (matcher.find()) { // Extract the result URL, title and excerpt String resultUrl = URLDecoder.decode(matcher.group(1), "UTF-8"); String resultTitle = matcher.group(2).replaceAll("</?b>", "").replaceAll("", ""); String resultContent = matcher.group(3).replaceAll("</?b>", "").replaceAll(" ", ""); // Create a new YahooResult and add to the list result.add(new YahooResult(resultUrl, resultTitle, resultContent)); } return result; }
public List<YahooResult> search(int page) throws MalformedURLException, URISyntaxException, IOException { // Set the page number and search this.page = page; return search(); }
public List<YahooResult> nextPage() throws MalformedURLException, URISyntaxException, IOException { // Increment the page number and search page++; return search(); }
public List<YahooResult> previousPage() throws MalformedURLException, URISyntaxException, IOException { // Decrement the page number and search; if the page number is 1 return an empty list if (page > 1) { page--; return search(); } else return new ArrayList<YahooResult>(); }
}
class YahooResult {
private URL url; private String title; private String content;
public URL getUrl() { return url; }
public void setUrl(URL url) { this.url = url; }
public void setUrl(String url) throws MalformedURLException { this.url = new URL(url); }
public String getTitle() { return title; }
public void setTitle(String title) { this.title = title; }
public String getContent() { return content; }
public void setContent(String content) { this.content = content; }
public YahooResult(URL url, String title, String content) { setUrl(url); setTitle(title); setContent(content); }
public YahooResult(String url, String title, String content) throws MalformedURLException { setUrl(url); setTitle(title); setContent(content); }
@Override public String toString() { StringBuilder sb = new StringBuilder(); if (title != null) { sb.append(",title=").append(title); } if (url != null) { sb.append(",url=").append(url); } return sb.charAt(0) == ',' ? sb.substring(1) : sb.toString(); }
}
public class TestYahooSearch {
public static void main(String[] args) throws MalformedURLException, URISyntaxException, IOException { // Create a new search YahooSearch search = new YahooSearch("Rosetta code"); // Get the search results List<YahooResult> results = search.search(); // Show the search results for (YahooResult result : results) { System.out.println(result.toString()); } }
}</lang>
Kotlin
This is based on the C# entry but uses a regular expression based on what appears to be the Yahoo! format as at the date of this entry (4 December 2017). <lang scala>// version 1.2.0
import java.net.URL
val rx = Regex("""class YahooResult(var title: String, var link: String, var text: String) {
override fun toString() = "\nTitle: $title\nLink : $link\nText : $text"
}
class YahooSearch(val query: String, val page: Int = 0) {
private val content: String
init {
    // Fetch the result page for `query` once, at construction time.
    // The query is percent-encoded so that spaces, '&', '#', etc. cannot
    // break or alter the request URL.
    // NOTE(review): callers that already pass a pre-encoded query would now
    // be double-encoded; the call in this file passes plain text.
    val yahoo = "http://search.yahoo.com/search?"
    val encodedQuery = java.net.URLEncoder.encode(query, "UTF-8")
    // `b` is the 1-based index of the first result; pages hold 10 results.
    val url = URL("${yahoo}p=$encodedQuery&b=${page * 10 + 1}")
    content = url.readText()
}
val results: MutableList<YahooResult> get() { val list = mutableListOf<YahooResult>() for (mr in rx.findAll(content)) { val title = mr.groups[2]!!.value.replace("", "").replace("", "") val link = mr.groups[1]!!.value val text = mr.groups[3]!!.value.replace("", "").replace("", "") list.add (YahooResult(title, link, text)) } return list }
fun nextPage() = YahooSearch(query, page + 1)
fun getPage(newPage: Int) = YahooSearch(query, newPage)
}
fun main(args: Array<String>) {
for (page in 0..1) { val x = YahooSearch("rosettacode", page) println("\nPAGE ${page + 1} =>") for (result in x.results.take(3)) println(result) }
}</lang> Output (restricted to first three results on first two pages):
PAGE 1 => Title: Rosetta Code - Official Site Link : http://rosettacode.org/wiki/Rosetta_Code Text : Rosetta Code is a programming chrestomathy site. The idea is to present solutions to the same task in as ... Title: Rosetta Code - Wikipedia Link : https://en.wikipedia.org/wiki/Rosetta_Code Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ... Title: Rosetta Code (@rosettacode) | Twitter Link : https://twitter.com/rosettacode Text : The latest Tweets from Rosetta Code (@rosettacode). Twitter account for http://t.co/DuRZFWDfRn. The ... PAGE 2 => Title: Rosetta Code Blog Link : http://blog.rosettacode.org/ Text : As I noted, there was an expectation of downtime as the VPS hostRosetta Code sits on moved from one data ... Title: Rosetta Code - Wikipedia Link : https://en.wikipedia.org/wiki/User:Paddy3118/Rosetta_Code Text : Rosetta Code is a wiki-based programming chrestomathy website with implementations of common algorithms ... Title: Rosetta Code and ABAP | SAP Blogs Link : https://blogs.sap.com/2015/03/27/rosetta-code-and-abap/ Text : Last week Christian Drumm (@ceedee666) and Fred Verheul (@fredverheul) had a short conversation on ...
Mathematica/Wolfram Language
We cannot define a class in Mathematica, so I generate a "Manipulate" object instead. <lang>Manipulate[
Column[Flatten[ StringCases[ StringCases[ URLFetch[ "http://search.yahoo.com/search?p=" <> query <> "&b=" <>ToString@page], "<ol" ~~ ___ ~~ ""],
"<a" ~~ Shortest[__] ~~ "class=\"yschttl spt\" href=\"" ~~ Shortest[url__] ~~ "\"" ~~ Shortest[__] ~~ ">" ~~ Shortest[title__] ~~"
Column[{Hyperlink[Style[#1, Larger], #2], #3, Style[#2, Smaller]} &@ StringReplace[{title, url, abstr}, {"<" ~~ Shortest[__] ~~ ">" -> "", "&#" ~~ n : DigitCharacter ... ~~ ";" :> FromCharacterCode[FromDigits@n], "&" -> "&", """ -> "\"", "<" -> "<", ">" -> ">"}]]], 1], Spacings -> 2], {{input, "", "Yahoo!"}, InputField[Dynamic@input, String] &}, {{query, ""}, ControlType -> None}, {{page, 1}, ControlType -> None}, Row[{Button["Search", page = 1; query = input], Button["Prev", page -= 10, Enabled -> Dynamic[page >= 10]], Button["Next", page += 10]}]]</lang>
Nim
<lang nim>import httpclient, strutils, htmlparser, xmltree, strtabs const
PageSize = 7 YahooURLPattern = "https://search.yahoo.com/search?fr=opensearch&b=$$#&pz=$#&p=".format(PageSize)
type
SearchResult = ref object url, title, content: string SearchInterface = ref object client: HttpClient urlPattern: string page: int results: array[PageSize+2, SearchResult]
proc newSearchInterface(question: string): SearchInterface =
new result result.client = newHttpClient()
- result.client = newHttpClient(proxy = newProxy(
- "http://localhost:40001")) # only http_proxy supported
result.urlPattern = YahooURLPattern&question
proc search(si: SearchInterface) =
  ## Downloads the result page selected by `si.page` and stores the hits
  ## in `si.results`, starting at index 0.  Slots left over from a previous
  ## (longer) page are reset to nil so that readers can stop at the first nil.
  let html = parseHtml(si.client.getContent(si.urlPattern.format(
      si.page*PageSize+1)))
  var i = 0
  for d in html.findAll("div"):
    # `results` is a fixed-size array: stop instead of writing past its end
    # when the page carries more hits than there are slots.
    if i >= si.results.len: break
    let dAttrs = d.attrs
    if dAttrs == nil or
        not dAttrs.getOrDefault("class").startsWith("dd algo algo-sr relsrch"):
      continue
    # `child` returns nil when the result box has no nested <div>.
    let inner = d.child("div")
    if inner == nil: continue
    for a in inner.findAll("a"):
      let aAttrs = a.attrs
      if aAttrs != nil and
          aAttrs.getOrDefault("class") == " ac-algo fz-l ac-21th lh-24":
        # A hit without a <p> abstract gets an empty content string
        # instead of failing on `[0]`.
        let paragraphs = d.findAll("p")
        let abstract = if paragraphs.len > 0: paragraphs[0].innerText else: ""
        si.results[i] = SearchResult(url: aAttrs["href"], title: a.innerText,
            content: abstract)
        inc i
        break # only the first matching link of a result box is used
  # Clear stale entries that remain from the previously loaded page.
  while i < si.results.len and si.results[i] != nil:
    si.results[i] = nil
    inc i
proc nextPage(si: SearchInterface) =
si.page+=1 si.search()
proc echoResult(si: SearchInterface) =
for res in si.results: if res == nil: break echo(res[])
var searchInf = newSearchInterface("weather") searchInf.search() searchInf.echoResult() echo("searching for next page...") searchInf.nextPage() searchInf.echoResult() </lang>
- Output:
(url: "https://weather.com/", title: "National and Local Weather Radar, Daily Forecast, Hurricane ...", content: "The Weather Channel and weather.com provide a national and local weather forecast for cities, as well as weather radar, report and hurricane coverage ") (url: "https://weather.com/weather/tenday/l/California+MO?canonicalCityId=d58964aa4d5c9ba2a8e76fe9175052ef5d1ed9ee98eb5514e2b58d67722f7e0e", title: "California, MO 10-Day Weather Forecast - The Weather Channel ...", content: "Be prepared with the most accurate 10-day forecast for California, MO with highs, lows, chance of precipitation from The Weather Channel and Weather.com ") (url: "https://www.accuweather.com/", title: "Local, National, & Global Daily Weather Forecast | AccuWeather", content: "AccuWeather has local and international weather forecasts from the most accurate weather forecasting technology featuring up to the minute weather reports ") (url: "https://www.weather.gov/", title: "National Weather Service", content: "Surface Weather Upper Air Marine and Buoy Reports Snow Cover Satellite Space Weather International Observations. FORECAST Local Forecast International Forecasts Severe Weather Current Outlook Maps Drought Fire Weather Fronts/Precipitation Maps Current Graphical Forecast Maps Rivers Marine Offshore and High Seas Hurricanes Aviation Weather ") (url: "https://www.wunderground.com/", title: "Local Weather Forecast, News and Conditions | Weather Underground", content: "Weather Underground provides local & long-range weather forecasts, weather reports, maps & tropical weather conditions for locations worldwide ") (url: "https://graphical.weather.gov/sectors/missouri.php", title: "NOAA Graphical Forecast for Missouri - National Weather Service", content: "National Weather Service 1325 East West Highway Silver Spring, MD 20910 Page Author: NWS Internet Services Team: Disclaimer Information Quality Credits ... 
") (url: "https://www.weatherbug.com/weather-forecast/now/macon-mo-63552", title: "Macon, Missouri | Current Weather Forecasts, Live Radar Maps ...", content: "For more than 20 years Earth Networks has operated the world’s largest and most comprehensive weather observation, lightning detection, and climate networks. We are now leveraging our big data smarts to deliver on the promise of IoT. ") (url: "https://www.accuweather.com/en/us/lakeview-heights-mo/65338/weather-forecast/2107359", title: "Lakeview Heights, MO Today, Tonight & Tomorrow\'s Weather ...", content: "Get the forecast for today, tonight & tomorrow\'s weather for Lakeview Heights, MO. Hi/Low, RealFeel®, precip, radar, & everything you need to be ready for the day, commute, and weekend! ") (url: "https://www.wunderground.com/forecast/us/az/tucson", title: "Tucson, AZ 10-Day Weather Forecast | Weather Underground", content: "Tucson Weather Forecasts. Weather Underground provides local & long-range weather forecasts, weatherreports, maps & tropical weather conditions for the Tucson area. ") searching for next page... (url: "https://forecast.weather.gov/", title: "National Weather Service", content: "NOAA National Weather Service National Weather Service. Widespread Heat This Week; Monsoon Rain Lingers. Widespread heat concerns are expected through at least midweek as high pressure covers a large portion of the U.S., especially the Central half. ") (url: "https://weather.com/weather/tenday/l/Port+Huron+MI?canonicalCityId=772884de37d69a70824031f9ab1202b956e665bdf14a6ffd3257184d64d33351", title: "Port Huron, MI 10-Day Weather Forecast - The Weather Channel ...", content: "Be prepared with the most accurate 10-day forecast for Port Huron, MI with highs, lows, chance of precipitation from The Weather Channel and Weather.com ") (url: "https://www.weather.gov/sgx/", title: "San Diego, CA - National Weather Service", content: "Jul 17, 2021 · NOAA National Weather Service San Diego, CA. 
Seasonable temperatures will be felt tonight with low cloudiness along the coast and into the valleys with a mostly clear sky inland. ") (url: "https://www.weatherbug.com/weather-forecast/now/houston-tx-77007", title: "Houston, Texas | Current Weather Forecasts, Live Radar Maps ...", content: "For more than 20 years Earth Networks has operated the world’s largest and most comprehensive weather observation, lightning detection, and climate networks. We are now leveraging our big data smarts to deliver on the promise of IoT. ") (url: "https://www.msn.com/en-us/weather", title: "MSN", content: "Sunny. There will be mostly sunny skies. The high will be 103°. Feels Like. 65°. Air Quality. Moderate air ( 51 - 100) Primary pollutant PM2.5 11 μg/m³. 52. ") (url: "https://www.noaa.gov/weather", title: "Weather | National Oceanic and Atmospheric Administration", content: "Jun 04, 2021 · Weather, water and climate events, cause an average of approximately 650 deaths and $15 billion in damage per year and are responsible for some 90 percent of all presidentially-declared disasters. About one-third of the U.S. economy – some $3 trillion – is sensitive to weather and climate. ") (url: "https://www.nbcphiladelphia.com/weather/", title: "NBC10 Philadelphia – Philadelphia News, Local News, Weather ...", content: "Weather stories. severe weather 3 mins ago ‘Destructive Damage\' to Be Added to National Weather Service Cell Phone Alerts weather 5 hours ago Today\'s NBC10 First Alert Forecast ... ")
Oz
Instead of a class the implementation defines a function which returns a lazy list of result pages. This also makes it possible to request e.g. the first and the third page without any resources wasted on an unneeded second page.
We implement some simple parsing with logic programming. Regular expressions in Oz don't seem to support lazy quantification which makes parsing the result pages with them difficult. <lang oz>declare
[HTTPClient] = {Module.link ['x-ozlib://mesaros/net/HTTPClient.ozf']} [StringX] = {Module.link ['x-oz://system/String.ozf']} [Regex] = {Module.link ['x-oz://contrib/regex']}
%% Displays page 1 and 3 of the search results. %% The user can request and display more with context menu->Actions->Make Needed. proc {ExampleUsage} Pages = {YahooSearch "Rosetta code"} in {Inspector.configure widgetShowStrings true} {ForAll {Nth Pages 1} Value.makeNeeded} {ForAll {Nth Pages 3} Value.makeNeeded} %% Display the infinite list of search result pages. {Inspect Pages} end
%% Returns a lazy list of pages. %% A page is a lazy list of entries like this: result(url:U title:T content:C). fun {YahooSearch Query} FetchURL = {CreateURLFetcher} fun {Page Nr}
StartResult = (Nr-1)*10+1 %% only retrieve it when really needed Doc = {Value.byNeed fun {$} {FetchURL "http://search.yahoo.com/search" ["p"#Query "b"#{Int.toString StartResult}]} end} RE = "<a class=\"yschttl spt\" href="
in
%% Lazily returns results. %% In this way it is possible to build the pages list structure %% without creating the single elements %% (e.g. retrieve page 1 and 3 but not 2). for Match in {Regex.allMatches RE Doc} yield:Yield do Xs = {List.drop Doc Match.0.2} in {Yield {ParseEntry Xs}} end
end in for PageNr in 1;PageNr+1 yield:Yield do
{Yield {Page PageNr}}
end end
fun {CreateURLFetcher} Client = {New HTTPClient.cgiGET
init(inPrms(toFile:false toStrm:true) httpReqPrms )}
%% close when no longer used {Finalize.register Client proc {$ C} {C closeAll(true)} end}
fun {FetchURL Url Params}
OutParams
in
{Client getService(Url Params ?OutParams ?_)} OutParams.sOut
end in FetchURL end
%% Xs: String containing HtmL %% Result: "result(url:U title:T content:C)" or "parseError" fun {ParseEntry Xs} proc {Parse Root}
R1 R2 R3 R4 R4 R5 R6 R7 Url = {Fix {QuotedString Xs R1}} {Const ">" R1 R2} Title = {Fix {Until "</a>" R2 R3}}
{Const "choice %% "enchanted" result?
{Const "[] %% result with links into document
{Const "[] %% PDF file
{Const "[] %% With Review
{Const "R6 = nil %% no nice abstract when a review is there [] %% normal result R6 = R4 end Abstract = choice
{Const "[] "" end
in
Root = result(url:Url title:Title content:Abstract)
end in {CondSelect {SearchOne Parse} 1 parseError} end
%% Result: contents of Xs until M is found. %% Xs = {Append M Yr} fun {Until M Xs ?Yr} L R in {List.takeDrop Xs {Length M} L R} if L == M then Yr = R nil elsecase Xs of X|Xr then X|{Until M Xr Yr} [] nil then Yr = nil nil end end
%% Asserts that Xs starts with C. Returns the remainder in Ys. proc {Const C Xs ?Ys} {List.takeDrop Xs {Length C} C Ys} end
%% Assert that a quoted string follows. %% Returns the unquoted string and binds Ys to the remainder of Xs. fun {QuotedString &"|Xs ?Ys} fun {Loop Xs Ys}
case Xs of &\\|&"|Xr then &\\|&"|{Loop Xr Ys} [] &"|Xr then Ys = Xr nil [] X|Xr then X|{Loop Xr Ys} end
end in {Loop Xs Ys} end
%% Remove formatting tags. fun {Fix Xs}{Until "</a>"
{FoldL ["" "" "" " " "..."] fun {$ Ys Z}
{StringX.replace Ys Z ""}
end Xs} _} end
in
{ExampleUsage}</lang>
Perl
<lang perl>package YahooSearch;
use Encode; use HTTP::Cookies; use WWW::Mechanize;
- --- Internals -------------------------------------------------
# apply BLOCK VALUE
# Runs BLOCK with $_ set to a private copy of VALUE and returns whatever
# $_ holds afterwards.  The caller's VALUE and the caller's $_ are untouched.
sub apply (&$)
 {my ($code, $value) = @_;
  local $_ = $value;
  $code->();
  return $_;}
- We construct a cookie to get 100 results per page and prevent
- "enhanced results".
my $search_prefs = 'v=1&n=100&sm=' .
apply {s/([^a-zA-Z0-9])/sprintf '%%%02X', ord $1/ge} join '|', map {'!' . $_} qw(hsb Zq0 XbM sss dDO VFM RQh uZ0 Fxe yCl GP4 FZK yNC mEG niH);
my $cookies = HTTP::Cookies->new; $cookies->set_cookie(0, 'sB', $search_prefs, '/', 'search.yahoo.com');
my $mech = new WWW::Mechanize
(cookie_jar => $cookies, stack_depth => 0);
sub read_page
{my ($next, $page, @results) = ($mech->find_link(text => 'Next >')->url, decode 'iso-8859-1', $mech->content); while ($page =~ m{
<a \s class="yschttl \s spt" \s
href=" ([^"]+) " \s* > #"
(.+?) </a>
.+?
(.+?) }xg)
{push @results, {url => $1, title => $2, content => $3};
foreach ( @{$results[-1]}{qw(title content)} )
{s/<.+?>//g;
$_ = encode 'utf8', $_;}}
return $next, \@results;}
- --- Methods ---------------------------------------------------
sub new
{my $invocant = shift;
my $class = ref($invocant) || $invocant;
$mech->get('http://search.yahoo.com/search?p=' . apply
{s/([^a-zA-Z0-9 ])/sprintf '%%%02X', ord $1/ge;
s/ /+/g;}
shift);
my ($next, $results) = read_page();
return bless {link_to_next => $next, results => $results}, $class;}
sub results
{@{shift()->{results}};}
sub next_page
{my $invocant = shift;
my $next = $invocant->{link_to_next};
unless ($next)
{$invocant->{results} = [];
return undef;}
$mech->get($next);
($next, my $results) = read_page();
$invocant->{link_to_next} = $next;
$invocant->{results} = $results;
return 1;}</lang>
Phix
<lang Phix>-- demo\rosetta\Yahoo_search_interface.exw
include builtins\libcurl.e
constant glyphs = {{"\xC2\xB7 ","*"}, -- bullet point
{"'",`'`}, -- single quote
{""",`"`}, -- double quote
{"&","&"}, -- ampersand
{"\xE2\x94\xAC\xC2\xAB","[R]"}, -- registered
{"\xC2\xAE","[R]"}}, -- registered
{gutf8,gascii} = columnize(glyphs),
tags = {{`<a `,`</a>`},
{``,``},
{``,``}}
function grab(string txt, opener, closer, integer tdx)
    --
    -- Extract the text between the first <opener> found at or after txt[tdx]
    -- and the <closer> that follows it, strip the markup listed in <tags>,
    -- replace the <gutf8> sequences with their <gascii> equivalents and clip
    -- the result to 80 characters.
    --
    -- Returns {next_index, text}, where next_index is the position just past
    -- the closer (in the original txt), or {0,""} when there is no further
    -- complete opener..closer pair.
    --
    integer openidx = match(opener,txt,tdx)
    if openidx=0 then return {0,""} end if
    integer closeidx = match(closer,txt,openidx)
    -- an opener without a closer would otherwise yield an invalid slice
    if closeidx=0 then return {0,""} end if
    txt = txt[openidx+length(opener)..closeidx-1]
    tdx = 1
    while tdx<=length(tags) do
        {opener,closer} = tags[tdx]
        integer i = match(opener,txt)
        if i=0 then
            tdx += 1
        else
            integer tagend
            if opener[$]='>' then
                tagend = i+length(opener)-1
            else
                -- opener is only the start of a tag: remove up to its '>'
                tagend = find('>',txt,i)
                -- unterminated tag (find gave 0): drop the rest of the text
                if tagend=0 then tagend = length(txt) end if
            end if
            txt[i..tagend] = ""
            i = match(closer,txt,i)
            -- the closing tag may be missing from the extracted fragment
            if i!=0 then
                txt[i..i+length(closer)-1] = ""
            end if
        end if
    end while
    txt = substitute_all(txt,gutf8,gascii)
    if length(txt)>80 then txt[78..$] = ".." end if
    return {closeidx+length(closer),txt}
end function
procedure YahooSearch(string query, integer page=1)
printf(1,"Page %d:\n=======\n",page)
string url = sprintf("https://search.yahoo.com/search?p=%s&b=%d", {query, (page-1)*10+1})
object res = curl_easy_perform_ex(url)
if not string(res) then
?{"some error",res,curl_easy_strerror(res)}
return
end if
integer rdx = 1
string title, link, desc
while true do
{rdx,title} = grab(res,``,`
`,rdx)
if rdx=0 then exit end if
{rdx,link} = grab(res,``,``,rdx)
{rdx,desc} = grab(res,``,`
`,rdx)
printf(1,"title:%s\nlink:%s\ndesc:%s\n\n",{title,link,desc})
end while
end procedure
YahooSearch("rosettacode")
YahooSearch("rosettacode",2)</lang>
- Output:
Page 1:
=======
title:Rosetta Code
link:rosettacode.org
desc:Jan 29, 2016 * Rosetta Code is a programming chrestomathy site. The idea is t..
title:Rosetta Code - Wikipedia
link:en.wikipedia.org/wiki/Rosetta_Code
desc:Rosetta Code is a wiki -based programming website with implementations of com..
title:@rosettacode | Twitter
link:twitter.com/rosettacode
desc:The latest tweets from @rosettacode
title:Rosetta Code - Simple English Wikipedia, the free encyclopedia
link:simple.wikipedia.org/wiki/Rosetta_Code
desc:Rosetta Code is a wiki-based website that features ways to solve various prog..
<snip>
Page 2:
=======
title:Category:Guile - Rosetta Code
link:rosettacode.org/wiki/Category:Guile
desc:May 30, 2020 * Listed below are all of the tasks on Rosetta Code which have b..
title:Rosetta Code - c2.com
link:wiki.c2.com/?RosettaCode
desc:Rosetta Code is a repository for code examples that go beyond the traditional..
<snip>
title:Rosetta Stone | Discovery, History, & Facts | Britannica
link:www.britannica.com/topic/Rosetta-Stone
desc:Rosetta Stone, ancient Egyptian stone bearing inscriptions in several languag..
PicoLisp
<lang PicoLisp>(load "@lib/http.l")
(de yahoo (Query Page)
(default Page 1)
(client "search.yahoo.com" 80
(pack
"search?p=" (ht:Fmt Query)
"&b=" (inc (* 10 (dec Page))) )
(make
(while (from "<a class=\"yschttl spt\" href=\"")
(link
(make
(link (till "\"" T)) # Url
(from "")
(link (till "<" T)) # Title
(from "class=\"abstr\"")
(from ">")
(link # Content
(pack
(make
(loop
(link (till "<" T))
(T (eof))
(T (= "</div" (till ">" T)))
(char) ) ) ) ) ) ) ) ) ) )</lang>
Output:
: (more (yahoo "test"))
("http://www.test.com/" "Test" "Offers practice online tests for many ...
("http://www.test.com/aboutus.htm" "Test" "Test.com has a successful ...
("http://en.wikipedia.org/wiki/Test" "Test" "YUI Test is a testing ...
("http://en.wikipedia.org/wiki/F-test" "test " "test n. A procedure for ...
...
Python
<lang python>import urllib
import re
def fix(x):
    """Return x with HTML tags removed and ``&amp;`` decoded to ``&``.

    A tag is any ``<...>`` run that contains no further ``<``.
    """
    p = re.compile(r'<[^<]*?>')
    # Both literals had been lost from this line: the empty replacement
    # string for the tags, and the '&amp;' entity (which had been turned into
    # a plain '&', making the replace a no-op).
    return p.sub('', x).replace('&amp;', '&')
class YahooSearch:
def __init__(self, query, page=1):
self.query = query
self.page = page
self.url = "http://search.yahoo.com/search?p=%s&b=%s" %(self.query, ((self.page - 1) * 10 + 1))
self.content = urllib.urlopen(self.url).read()
def getresults(self):
self.results = []
for i in re.findall('<a class="yschttl spt" href=".+?">(.+?)</a>
- Output:
title = fix(i[0]) content = fix(i[1]) url = fix(i[2]) self.results.append(YahooResult(title, content, url)) return self.results def getnextpage(self): return YahooSearch(self.query, self.page+1) search_results = property(fget=getresults) nextpage = property(fget=getnextpage)
class YahooResult:
    """Plain record for one search hit."""

    def __init__(self, title, content, url):
        # The three assignments must be separate statements; fused on a
        # single line they do not parse.
        self.title = title
        self.content = content
        self.url = url
- Usage:
x = YahooSearch("test")
for result in x.search_results:
print result.title</lang>
R
Rather than using regexes to find the content (like some of the other solutions here) this method parses the HTML and finds the appropriate sections. <lang R>YahooSearch <- function(query, page=1, .opts=list(), ignoreMarkUpErrors=TRUE) {
if(!require(RCurl) || !require(XML)) { stop("Could not load required packages") } # Replace " " with "%20", etc query <- curlEscape(query) # Retrieve page b <- 10*(page-1)+1 theurl <- paste("http://uk.search.yahoo.com/search?p=", query, "&b=", b, sep="") webpage <- getURL(theurl, .opts=.opts) # Save search for nextpage function .Search <- list(query=query, page=page, .opts=.opts, ignoreMarkUpErrors=ignoreMarkUpErrors) assign(".Search", .Search, envir=globalenv()) # Parse HTML; retrieve results block webpage <- readLines(tc <- textConnection(webpage)); close(tc) if(ignoreMarkUpErrors) { pagetree <- htmlTreeParse(webpage, error=function(...){}) } else { pagetree <- htmlTreeParse(webpage) } findbyattr <- function(x, id, type="id") { ids <- sapply(x, function(x) x$attributes[type]) x[ids==id] } body <- pagetree$children$html$children$body bd <- findbyattr(body$children$div$children, "bd") left <- findbyattr(bd$div$children$div$children, "left") web <- findbyattr(left$div$children$div$children, "web") resol <- web$div$children$ol #Get url, title, content from results gettextfromnode <- function(x) { un <- unlist(x$children) paste(un[grep("value", names(un))], collapse=" ") } n <- length(resol) results <- list() length(results) <- n for(i in 1:n) { mainlink <- resoli$children$div$children[1]$div$children$h3$children$a url <- mainlink$attributes["href"] title <- gettextfromnode(mainlink) contenttext <- findbyattr(resoli$children$div$children[2], "abstr", type="class") if(length(contenttext)==0) { contenttext <- findbyattr(resoli$children$div$children[2]$div$children$div$children, "sm-abs", type="class") } content <- gettextfromnode(contenttext$div) resultsi <- list(url=url, title=title, content=content) } names(results) <- as.character(seq(b, b+n-1)) results
}
nextpage <- function() {
if(exists(".Search", envir=globalenv())) { .Search <- get(".Search", envir=globalenv()) .Search$page <- .Search$page + 1L do.call(YahooSearch, .Search) } else { message("No search has been performed yet") }
}
- Usage
YahooSearch("rosetta code") nextpage()</lang>
Racket
<lang Racket>#lang racket (require net/url) (define *yaho-url* "http://search.yahoo.com/search?p=~a&b=~a") (define *current-page* 0) (define *current-query* "") (define request (compose port->string get-pure-port string->url))
- strip html tags
(define (remove-tags text)
  ;; Delete every "<...>" run (one that contains no further "<") from text.
  (define tag-pattern #px"<[^<]+?>")
  (regexp-replace* tag-pattern text ""))
- search, parse and print
(define (search-yahoo query)
(unless (string=? *current-query* query) ;different query, go back to page 1 (set! *current-query* query) (set! *current-page* 0)) (let* ([current-page (number->string (add1 (* 10 *current-page*)))] [html (request (format *yaho-url* query current-page))][results (regexp-match* #px"lass=\"yschttl spt\" href=\".+?\">(.+?)(.+?).+?
(for ([result (in-list results)]) (printf "Title: ~a \n Link: ~a \n Text: ~a \n\n" (remove-tags (first result)) (remove-tags (second result) ) (remove-tags (third result))))))
- search next page
(define (next-page)
(set! *current-page* (add1 *current-page*)) (search-yahoo *current-query*))</lang>
- REPL:
(search-yahoo "Rosetta") Title: Partner With Our Interactive Marketing Agency Today | Rosetta Link: www.rosetta.com Text: Learn about the fastest growing interactive marketing agency in the country - Rosetta. Our strategic marketing planning is custom built and connects you with your ... Title: Official Rosetta Stone® - Language Learning - Learn a Language Link: www.rosettastone.com Text: Learn a new language with Rosetta Stone®. SUMMER SALE! $349 Levels 1-5 Set + Free Shipping. Ending Soon! Title: Rosetta (spacecraft) - Wikipedia, the free encyclopedia Link: en.wikipedia.org/wiki/Rosetta_probe Text: noun 1. a town in N Egypt, at a mouth of the Nile. 2. a female given name . Relevant Questions Why Was The Rosetta Ston... Who Is Rosetta Stone? Why Is The Rosetta ... Title: Rosetta stone: Definition from Answers.com Link: www.answers.com/topic/rosetta-stone Text: Rosetta stone n. A basalt tablet bearing inscriptions in Greek and in Egyptian hieroglyphic and demotic scripts that w Title: Rosetta (1999) - IMDb Link: www.imdb.com/title/tt0200071 Text: The first scene, like almost all others, is a fighting scene. A girl, about 18, is sacked from her factory work because her trial period is over. The girl, Rosetta ... Title: Welcome to Rosetta Stone® Classroom Link: salem-keizersd.rosettastoneclassroom.com Text: Welcome to Rosetta Stone® Classroom. First Time Users; © 2013 Rosetta Stone Ltd. All rights reserved. Title: Rosetta Hardscapes Link: www.discoverrosetta.com/index.html Text: Rosetta Hardscapes sells and licenses concrete pavers, fire pits, retaining walls and landscaping features with the colors and textures of natural stone. (next-page) Title: Rosetta Stone Link: www.rosettastone.com/?prid=rosettaclassroom_com Text: Find great prices & selection on Rosetta Stone language software for Windows & Mac; shop & buy titles to learn Spanish, French, & more. Title: rosetta stone spanish | eBay - Electronics, Cars, Fashion ... 
Link: www.ebay.com/sch/i.html?_nkw=rosetta+stone+spanish Text: Find great deals on eBay for rosetta stone spanish and rosetta stone spanish latin america. Shop with confidence. Title: Apple - Rosetta Link: www.apple.com/asia/rosetta Text: Applications bearing the Universal symbol will run natively on both Intel- and PowerPC-based Mac computers. What about the applications you already own? Enter Rosetta. Title: Rosetta | Free Music, Tour Dates, Photos, Videos Link: www.myspace.com/rosetta Text: The International Rosetta Mission was approved in ... Lutetia is revealed by a comprehensive analysis of the data gathered by ESA's Rosetta spacecraft when it ... Title: Amazon.com: rosetta stone Link: www.amazon.com/s?ie=UTF8&page=1&rh=i%3Aaps%2Ck%3Arosetta... Text: Rosetta Stone Spanish (Latin America) Level 1 by Rosetta Stone (CD-ROM - Sept. 14, 2010) - Mac OS X 10.6 Snow Leopard, Windows 7 / 8 / XP. Buy new: $179.00 . Title: Rosetta - Disney Wiki Link: disney.wikia.com/wiki/Rosetta Text: Rosetta: Yea, no, I don't really do mud.Vidia: But, you're a garden fairy!Rosetta: Ironic, isn't... Title: Rosetta - Hamilton, NJ - Company | Facebook Link: www.facebook.com/rosetta Text: Rosetta, Hamilton, NJ. 2,060 likes · 36 talking about this · 135 were here. We are currently ranked by Ad Age among the top US digital agencies and recently named ... Title: Rosetta Stone (Game) - CNET Download.com Link: download.cnet.com/Rosetta-Stone/3000-2111_4-10835868.html Text: Whether your are playing Greek number mode or Egyptian letter mode, the number one rule to keep in mind is keeping the scales balanced but its not that ...
Raku
(formerly Perl 6)
YahooSearch.rakumod:
<lang perl6> use Gumbo; use LWP::Simple; use XML::Text;
# Screen-scraping wrapper around one Yahoo! results page.
# Holds the parsed HTML of the current page; `next` replaces it with the
# page behind the "next" link and `results` prints every hit on it.
class YahooSearch {
    has $!dom;                          # parsed document of the current page

    submethod BUILD (:$!dom) { }

    # Fetch and parse the first results page for $term.
    method new($term) {
        self.bless(
            dom => parse-html(
                LWP::Simple.get("http://search.yahoo.com/search?p={ $term }")
            )
        );
    }

    # Follow the first <a class="next"> link and make that page current.
    # Returns the object itself so calls can be chained.
    method next {
        $!dom = parse-html(
            LWP::Simple.get(
                $!dom.lookfor( TAG => 'a', class => 'next' ).head.attribs<href>
            )
        );
        self;
    }

    # Flatten a node to text: text nodes yield their text, other nodes
    # yield the space-joined, trimmed text of their children.
    method text ($node) {
        return unless $node;
        return $node.text if $node ~~ XML::Text;

        $node.nodes.map({ self.text($_).trim }).join(' ');
    }

    # Print title, URL and text of every result on the current page.
    # The counter is a `state` variable, so numbering continues across pages.
    # Returns the object itself so calls can be chained.
    method results {
        state $n = 0;
        for $!dom.lookfor( TAG => 'h3', class => 'title') {
            given .lookfor( TAG => 'a' )[0] {
                next unless $_;                                                # No Link
                next if .attribs<href> ~~ / ^ 'https://r.search.yahoo.com' /;  # Ad
                say "=== #{ ++$n } ===";
                # The else branch was missing; an empty title is printed
                # when the link has no content.
                say "Title: { .contents[0] ?? self.text( .contents[0] ) !! '' }";
                say " URL: { .attribs<href> }";

                my $pt = .parent.parent.parent.elements( TAG => 'div' ).tail;
                say " Text: { self.text($pt) }";
            }
        }
        self;
    }
}
sub MAIN (Str $search-term) is export {
    # Print the first page, move to the next page, then print that one too.
    my $search = YahooSearch.new($search-term);
    $search.results;
    $search.next;
    $search.results;
} </lang>
And the invocation script is simply:
yahoo-search.raku
use YahooSearch;
So:
raku yahoo-search.raku test
Should give out something like the following: <lang>
#1
Title:
URL: https://www.speedtest.net/ Text: At Ookla, we are committed to ensuring that individuals with disabilities can access all of the content at www.speedtest.net. We also strive to make all content in Speedtest apps accessible. If you are having trouble accessing www.speedtest.net or Speedtest apps, please email legal@ziffdavis.com for assistance. Please put "ADA Inquiry" in the ...
#2
Title: Test | Definition of Test by Merriam-Webster
URL: https://www.merriam-webster.com/dictionary/test Text: Test definition is - a means of testing: such as. How to use test in a sentence.
#3
Title: - Video Results
URL: https://video.search.yahoo.com/search/video?p=test Text: More Test videos
</lang>
...and should go up to result #21!
Ruby
Uses Hpricot to parse the HTML. Someone more skillful than I at XPath or CSS could tighten up the parse_html method.
<lang ruby>require 'open-uri' require 'hpricot'
# One search hit: the link target, the link text and the abstract text.
SearchResult = Struct.new(:url, :title, :content)
# Iterator over Yahoo! search results for one term.
#
# Results are fetched lazily, one page at a time; next_result hands them
# out one by one and requests the following page when the buffer is empty.
class SearchYahoo
  # Components for URI::HTTP.build: userinfo, host, port, path, query, fragment.
  @@urlinfo = [nil, 'ca.search.yahoo.com', 80, '/search', nil, nil]

  # term:: the search string
  def initialize(term)
    @term = term
    @page = 1
    @results = nil          # nil until the first page has been requested
    @url = URI::HTTP.build(@@urlinfo)
  end

  # Returns the next SearchResult, or nil when a fetched page held no results.
  def next_result
    if not @results
      @results = []
      fetch_results
    elsif @results.empty?
      next_page
    end
    @results.shift
  end

  # Downloads the current page and appends its results to the buffer.
  def fetch_results
    @url.query = query_string
    # URI#open (from open-uri) replaces Kernel#open, which no longer
    # accepts URIs on Ruby 3.
    doc = @url.open { |f| Hpricot(f) }
    parse_html(doc)
  end

  # Advances the result offset by one page (10 results) and fetches it.
  def next_page
    @page += 10
    fetch_results
  end

  # Extracts url/title/content from every div whose class starts with "res"
  # inside div#main and pushes one SearchResult per such div.
  def parse_html(doc)
    doc.search("div#main").search("div").each do |div|
      next unless div.has_attribute?("class") and div.get_attribute("class").index("res") == 0
      result = SearchResult.new
      div.search("a").each do |link|
        next unless link.has_attribute?("class") and link.get_attribute("class") == "yschttl spt"
        result.url = link.get_attribute("href")
        result.title = link.inner_text
      end
      div.search("div").each do |abstract|
        next unless abstract.has_attribute?("class") and abstract.get_attribute("class").index("abstr")
        result.content = abstract.inner_text
      end
      @results << result
    end
  end

  private

  # Form-encoded query string for the current term and offset.
  # URI.encode_www_form replaces URI.escape, which was removed in Ruby 3.0.
  def query_string
    URI.encode_www_form([['p', @term], ['b', @page]])
  end
end
s = SearchYahoo.new("test") 15.times do |i|
result = s.next_result puts i+1 puts result.title puts result.url puts result.content puts
end</lang>
Run BASIC
<lang runbasic>'-------------------------------------------------------------------------- ' send this from the server to the clients browser '--------------------------------------------------------------------------
' Search form: a small table holding the "Find" and "Page" text boxes and
' the Search / Exit buttons.
' NOTE(review): the markup inside these html strings was lost from the page
' text (only the cell separators survived); the table below is a plain
' reconstruction of that layout - confirm against the original listing.
html "<table border=1>"
html "<tr><td colspan=2 align=center>Yahoo Search</td></tr>"
html "<tr><td align=right>Find</td><td>"
textbox #find,findThis$,30
html "</td></tr><tr><td align=right>Page</td><td>"
textbox #page,findPage$,2
html "</td></tr><tr><td colspan=2 align=center>"
button #s, "Search", [search]
html " "
button #ex, "Exit", [exit]
html "</td></tr></table>"
wait
'--------------------------------------------------------------------------
' [search]: read the search term and page number typed into the form
'--------------------------------------------------------------------------
[search]
findThis$ = trim$(#find contents$())
findPage$ = trim$(#page contents$())
findPage = max(val(findPage$),1)        ' page numbers start at 1

'--------------------------------------------------------------------------
' Fetch the results and append them below the form. The page is not
' cleared (no cls), so the form stays available for another search or page.
'--------------------------------------------------------------------------
url$ = "http://search.yahoo.com/search?p=";findThis$;"&b=";((findPage - 1) * 10) + 1
html httpget$(url$)
wait
[exit]
cls                                     ' wipe the browser page before stopping
wait</lang> This user input sits at the top of the yahoo page so they can select a new search or page
Tcl
<lang tcl>package require http
# Remove Yahoo's highlighting/line-break markup from a captured fragment and
# cut off anything from the closing title markup onwards.
# NOTE(review): the tag names in this table were stripped from the page text;
# they are reconstructed and should be checked against the original listing.
proc fix s {
    string map {<b>...</b> "" <b> "" </b> "" <wbr> "" "<wbr />" ""} \
        [regsub "</a></h3></div>.*" $s ""]
}

# Fetch one page of Yahoo! results.
#   term - the search string
#   page - 1-based page number (default 1)
# Returns a flat list: title content url title content url ...
# As a side effect, (re)defines the command "Nextpage", which fetches the
# page after this one.
proc YahooSearch {term {page 1}} {
    # Build the (ugly) scraper pattern: three captures = title, abstract, url.
    # NOTE(review): the markup between the captures was stripped from the
    # page text; reconstructed so the pattern has the three groups that the
    # foreach below consumes - verify against the markup actually served.
    append re {<a class="yschttl spt" href=".+?" >(.+?)</a></h3>}
    append re {</div><div class="abstr">(.+?)}
    append re {</div><span class=url>(.+?)</span>}

    # Perform the query; note that this handles special characters
    # in the query term correctly
    set q [http::formatQuery p $term b [expr {$page*10-9}]]
    set token [http::geturl http://search.yahoo.com/search?$q]
    set data [http::data $token]
    http::cleanup $token

    # Assemble the results into a nice list
    set results {}
    foreach {- title content url} [regexp -all -inline $re $data] {
        lappend results [fix $title] [fix $content] [fix $url]
    }

    # set up the call for the next page
    interp alias {} Nextpage {} YahooSearch $term [incr page]

    return $results
}
# Usage: get the first two pages of results
foreach {title content url} [YahooSearch "test"] {
    puts $title
}
foreach {title content url} [Nextpage] {
    puts $title
}</lang>
With Tcl 8.6, more options are available for managing the global state, through objects and coroutines. First, an object-based solution that takes the basic YahooSearch functionality and dresses it up to be more Tcl-like: <lang tcl>package require Tcl 8.6
# Object wrapper around YahooSearch: remembers the search term and the
# current page, and keeps the most recently fetched page of results.
oo::class create WebSearcher {
    variable page term results

    # Store the term and load the first page straight away.
    constructor searchTerm {
        set term $searchTerm
        set page 0
        my nextPage
    }

    # Replace the stored results with those of the following page.
    # The scraping itself is delegated to the YahooSearch procedure.
    method nextPage {} {
        set results [YahooSearch $term [incr page]]
        return
    }

    # Iteration as a method: run $body in the caller's scope once per
    # result, with the three named caller variables set to its title,
    # contents and url.
    method for {titleVar contentsVar urlVar body} {
        upvar 1 $titleVar titleRef $contentsVar contentsRef $urlVar urlRef
        foreach {titleRef contentsRef urlRef} $results {
            uplevel 1 $body
        }
    }
}
# How to use. Note the 'for' method use below; new "keywords" as methods!
set ytest [WebSearcher new "test"]
$ytest for title - url {
    puts "\"$title\" : $url"
}
$ytest nextPage
$ytest for title - url {
    puts "\"$title\" : $url"
}
# TclOO objects are removed with "destroy"; there is no standard "delete" method.
$ytest destroy ;# standard method that deletes the object</lang> However, the paradigm of an iterator is also interesting and is more appropriately supported through a coroutine. This version conceals the fact that the service produces output in pages; care should be taken with it because it can produce rather a lot of network traffic... <lang tcl>package require Tcl 8.6
# Create a coroutine that yields one result dictionary (keys: title,
# content, url) per call, fetching further pages through YahooSearch as
# needed. Returns the name of the new coroutine command.
proc yahoo! term {
    coroutine yahoo![incr ::yahoo] apply {query {
        # First yield hands the coroutine's own name back to the creator.
        yield [info coroutine]
        while {1} {
            set pageItems [YahooSearch $query [incr pageNo]]
            if {![llength $pageItems]} {
                # Nothing left: finish with a break code.
                return -code break
            }
            foreach {itemTitle itemContent itemUrl} $pageItems {
                yield [dict create title $itemTitle content $itemContent url $itemUrl]
            }
        }
    }} $term
}
# test by getting first fifty titles...
set it [yahoo! "test"]
for {set i 50} {$i>0} {incr i -1} {
    puts [dict get [$it] title]
    after 300 ;# Slow the code down... :-)
}</lang>
Another approach: uses a class as specified in the task. Also, uses an html parser from tcllib (parsing html with regular expressions is a particular annoyance of mine).
<lang tcl>package require Tcl 8.6 package require http package require htmlparse package require textutil::adjust
# Yahoo! search as a TclOO class. The results page is parsed with
# htmlparse; html_parser_callback is a small state machine that collects a
# url, title and abstract for each result.
oo::class create yahoosearch {

    # Set the search term and reset paging to the first result.
    method search {s} {
        my variable searchterm page baseurl
        set baseurl {http://ca.search.yahoo.com/search}
        set searchterm $s
        set page 1
    }

    # Download the current page and parse it into the results list.
    method getresults {} {
        my variable state results current_data
        set state looking_for_results
        set results [list]
        set current_data [dict create]
        htmlparse::parse -cmd [list [self] html_parser_callback] [my gethtml]
    }

    # Advance the result offset by ten and load that page.
    method nextpage {} {
        my variable page
        incr page 10
        my getresults
    }

    # Return the next result dictionary, loading the first or the following
    # page when nothing is buffered.
    method nextresult {} {
        my variable results page
        if {![info exists results]} {
            my getresults
        } elseif {![llength $results]} {
            my nextpage
        }
        set results [lassign $results result]
        return $result
    }

    # Fetch the HTML of the current results page.
    method gethtml {} {
        my variable searchterm page baseurl
        set query [::http::formatQuery p $searchterm b $page]
        set token [http::geturl [format {%s?%s} $baseurl $query]]
        set html [http::data $token]
        http::cleanup $token
        return $html
    }

    # htmlparse callback, invoked once per tag. $state selects which piece
    # of a result is expected next.
    method html_parser_callback {tag slash param textBehindTheTag} {
        my variable state results current_data
        switch -exact -- $state {
            looking_for_results {
                # Skip everything before the main results container.
                if {$tag eq "div" && [string first {id="main"} $param] >= 0} {
                    set state ready
                }
            }
            ready {
                # A new result div, or the end of the document, closes the
                # result collected so far.
                if {($tag eq "div" && [string first {class="res} $param] >= 0) ||
                    ($tag eq "html" && $slash eq "/")
                } {
                    #" -- unbalanced quote disturbs syntax highlighting
                    if {[dict size $current_data] > 0} {
                        lappend results $current_data
                    }
                    set current_data [dict create]
                    set state getting_url
                }
            }
            getting_url {
                if {$tag eq "a" && [string match "*yschttl spt*" $param]} {
                    if {![regexp {href="(.+?)"} $param - href]} {
                        set href "no href in tag params: '$param'"
                    }
                    dict set current_data url $href
                    dict set current_data title $textBehindTheTag
                    set state getting_title
                }
            }
            getting_title {
                # Text after nested tags still belongs to the title until
                # the link is closed.
                if {$tag eq "a" && $slash eq "/"} {
                    set state looking_for_abstract
                } else {
                    dict append current_data title $textBehindTheTag
                }
            }
            looking_for_abstract {
                if {$tag eq "span" && [string first {class="url} $param] >= 0} {
                    set state ready
                } elseif {$tag eq "div" && [string first {class="abstr} $param] >= 0} {
                    dict set current_data abstract $textBehindTheTag
                    set state getting_abstract
                }
            }
            getting_abstract {
                if {$tag eq "div" && $slash eq "/"} {
                    set state ready
                } else {
                    dict append current_data abstract $textBehindTheTag
                }
            }
        }
    }
}
# Print up to 15 results: title, url and the wrapped, indented abstract.
yahoosearch create searcher
searcher search "search text here"

for {set x 1} {$x <= 15} {incr x} {
    set result [searcher nextresult]
    # nextresult yields an empty value when a page produced no results.
    if {[dict size $result] == 0} {
        break
    }
    # A result may lack an abstract; supply empty defaults so that
    # "dict with" always defines all three variables.
    set result [dict merge {title {} url {} abstract {}} $result]
    dict with result {
        puts $title
        puts $url
        puts [textutil::adjust::indent [textutil::adjust::adjust $abstract] " "]
        puts ""
    }
}</lang>
TXR
The following gives us a shell utility which we can invoke with arguments like "rosetta 0" to get the first page of search results for "rosetta".
The two arguments are handled as if they were two lines of text from a data source using @(next :args). We throw an exception if there is no match (insufficient arguments are supplied). The @(cases) directive has strictly ordered evaluation, so the throw in the second branch does not happen if the first branch has a successful pattern match. If the similar @(maybe) or @(some) directives were used, this wouldn't work.
A little sprinkling of regex is used.
<lang txr>#!/usr/bin/txr -f
@; Arguments: QUERY and PAGE (page numbers start at zero).
@(next :args)
@(cases)
@  QUERY
@  PAGE
@(or)
@  (throw error "specify query and page# (from zero)")
@(end)
@(next (open-command "!wget -O - http://search.yahoo.com/search?p=@QUERY\&b=@{PAGE}1 2> /dev/null"))
@; Collect link URL and title, and in parallel the abstract, of every result.
@; NOTE(review): the abstract pattern was stripped from the page text, which
@; left ABSTR unbound; the div below is a reconstruction - verify it against
@; the markup actually served.
@(all)
@  (coll)<a class="yschttl spt" href="@URL" @/[^>]+/>@TITLE</a>@(end)
@(and)
@  (coll)<div class="abstr">@ABSTR</div>@(end)
@(end)
@(output)
@  (repeat)
TITLE: @TITLE
URL: @URL
TEXT: @ABSTR
---
@  (end)
@(end)
</lang>
Sample run:
$ ./yahoosearch.txr rosetta 0 TITLE: <b>Rosetta</b> | Partner With Our Interactive <wbr />Marketing Agency Today URL: http://www.rosetta.com/Pages/default.aspx TEXT: Learn about the fastest growing interactive marketing agency in the country - <b>Rosetta</b>. Our strategic marketing planning is custom built and connects you with your ... --- TITLE: Official <b>Rosetta</b> Stone® - Learn a <wbr />Language Online - Language ... URL: http://www.rosettastone.com/ TEXT: <b>Rosetta</b> Stone is the world's #1 language-learning software. Our comprehensive foreign language program provides language learning for individuals and language learning ... --- TITLE: <b>Rosetta</b> (software) - Wikipedia, the <wbr />free encyclopedia URL: http://en.wikipedia.org/wiki/Rosetta_(software) TEXT: Rosettais a lightweight dynamic translatorfor Mac OS Xdistributed by Apple. It enabled applications compiled for the PowerPCfamily of processors to run on Apple systems that use... --- TITLE: <b>Rosetta</b> (spacecraft) - Wikipedia, the <wbr />free encyclopedia URL: http://en.wikipedia.org/wiki/Rosetta_space_probe TEXT: Rosettais a robotic spacecraftof the European Space Agencyon a mission to study the comet 67P/ChuryumovâGerasimenko. <b>Rosetta </b>consists of two main elements: the <b>Rosetta </b>space probeand... --- TITLE: Apple - Mac URL: http://www.apple.com/mac/ TEXT: Discover the world of Mac. Check out MacBook, iMac, iLife, and more. Download QuickTime, Safari, and widgets for free. --- TITLE: <b>Rosetta</b> | Free Music, Tour Dates, <wbr />Photos, Videos URL: http://www.myspace.com/rosetta TEXT: <b>Rosetta</b>'s official profile including the latest music, albums, songs, music videos and more updates. --- TITLE: <b>Rosetta</b> URL: http://rosettaband.com/ TEXT: Metal for astronauts. Philadelphia, since 2003. 
Contact us at rosettaband@gmail.com Twitter | Facebook --- TITLE: <b>Rosetta</b> URL: http://rosetta.jpl.nasa.gov/ TEXT: The <b>Rosetta</b> spacecraft is on its way to catch and land a robot on a comet! <b>Rosetta</b> will reach comet '67P/Churyumov-Gerasimenko' ('C-G') in 2014. The European Space Agency ... --- TITLE: <b>Rosetta</b> : Multi-script Typography URL: http://rosettatype.com/ TEXT: <b>Rosetta</b> is a new independent foundry with a strong focus on multi-script typography. We are committed to promote research and knowledge in that area and to support ... --- TITLE: <b>Rosetta</b> (1999) - IMDb URL: http://www.imdb.com/title/tt0200071/ TEXT: With Ãmilie Dequenne, Fabrizio Rongione, Anne Yernaux, Olivier Gourmet. Young and impulsive <b>Rosetta</b> lives with her alcoholic mother and, moved by despair, she will ... ---
- Programming Tasks
- Programming environment operations
- Networking and Web Interaction
- AArch64 Assembly
- ARM Assembly
- AutoHotkey
- C sharp
- D
- Gambas
- Go
- GUISS
- Haskell
- Unicon
- Icon Programming Library
- Java
- Java examples needing attention
- Examples needing attention
- Kotlin
- Kotlin examples needing attention
- Mathematica
- Wolfram Language
- Nim
- Oz
- OzHttpClient
- Perl
- Phix
- Phix/libcurl
- PicoLisp
- Python
- R
- RCurl
- XML
- Racket
- Raku
- Gumbo
- LWP-Simple
- Exemel
- Ruby
- Ruby examples needing attention
- RubyGems
- Hpricot
- Run BASIC
- Tcl
- Tcllib
- TXR
- Batch File/Omit
- Brlcad/Omit
- Lilypond/Omit
- Locomotive Basic/Omit
- Lotus 123 Macro Scripting/Omit
- M4/Omit
- Maxima/Omit
- ML/I/Omit
- Openscad/Omit
- PARI/GP/Omit
- PostScript/Omit
- Retro/Omit
- TI-83 BASIC/Omit
- TI-89 BASIC/Omit
- TPP/Omit
- ZX Spectrum Basic/Omit