Web scraping/OCaml

From Rosetta Code
(Redirected from Web Scraping/OCaml)

The content of this page is related to the main page Web Scraping#OCaml

<lang ocaml>
(** [init_socket addr port] resolves the host name [addr], opens a TCP
    connection to its first listed address on [port], and returns the pair
    [(in_channel, out_channel)] built on the connected socket.

    Both channels are created from the same descriptor, so closing either
    of them closes the connection.

    @raise Not_found if [Unix.gethostbyname] cannot resolve [addr].
    @raise Unix.Unix_error if the socket cannot be created or connected. *)
let init_socket addr port =
  let host = Unix.gethostbyname addr in
  (* Only the first address of the host entry is tried. *)
  let inet_addr = host.Unix.h_addr_list.(0) in
  let sockaddr = Unix.ADDR_INET (inet_addr, port) in
  let sock = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
  (* Release the descriptor if the connection attempt fails; otherwise it
     would leak, since no channel owns it yet. *)
  (try Unix.connect sock sockaddr
   with e -> Unix.close sock; raise e);
  let inchan = Unix.in_channel_of_descr sock in
  let outchan = Unix.out_channel_of_descr sock in
  (inchan, outchan)

(** [serialize ~post_data] encodes a list of [(key, value)] pairs as an
    [application/x-www-form-urlencoded] string: [key=value] pairs joined
    with ['&'].

    Keys and values must be given raw (not already encoded). ASCII letters,
    digits and [- _ . *] are kept as they are, a space becomes ['+'], and
    every other byte is written as [%XX], so that separators appearing
    inside a key or a value cannot corrupt the encoded form.

    Returns [""] for the empty list. *)
let serialize ~post_data =
  let encode s =
    let buf = Buffer.create (String.length s) in
    String.iter
      (function
        | 'A'..'Z' | 'a'..'z' | '0'..'9' | '-' | '_' | '.' | '*' as c ->
            Buffer.add_char buf c
        | ' ' -> Buffer.add_char buf '+'
        | c -> Buffer.add_string buf (Printf.sprintf "%%%02X" (Char.code c)))
      s;
    Buffer.contents buf
  in
  String.concat "&"
    (List.map (fun (key, value) -> encode key ^ "=" ^ encode value) post_data)

(** HTTP request method selected by the [~kind] argument of
    [submit_request]. [POST] carries the form fields to send, as a list of
    [(key, value)] pairs. *)
type request = GET | HEAD | POST of (string * string) list

(** [submit_request ~address ~port ~kind ~path ~referer ~user_agent]
    connects to [address] on [port] with [init_socket], sends an HTTP/1.0
    request for [path], and returns the [(in_channel, out_channel)] pair of
    the connection so that the caller can read the response.

    - [kind] selects the method. For [POST data] the fields are encoded
      with [serialize] and sent as an [application/x-www-form-urlencoded]
      body, together with [Content-length] and [Connection: close] headers.
    - [referer] and [user_agent] are options; [None] omits the header.

    The returned channels are left open; closing them is up to the caller.
    If writing the request fails, the connection is closed before the
    exception is re-raised. *)
let submit_request ~address ~port ~kind ~path ~referer ~user_agent =
  let req_tag, body =
    match kind with
    | GET -> "GET", None
    | HEAD -> "HEAD", None
    | POST data -> "POST", Some (serialize ~post_data:data)
  in
  let optional_header name = function
    | None -> ""
    | Some value -> Printf.sprintf "%s: %s\r\n" name value
  in
  (* The Host header must carry the port when it is not the default one. *)
  let host =
    if port = 80 then address else Printf.sprintf "%s:%d" address port
  in
  let body_headers, body =
    match body with
    | None -> "", ""
    | Some body ->
        "Content-type: application/x-www-form-urlencoded\r\n"
        ^ Printf.sprintf "Content-length: %d\r\n" (String.length body)
        ^ "Connection: close\r\n",
        body
  in
  let request =
    String.concat ""
      [ Printf.sprintf "%s %s HTTP/1.0\r\n" req_tag path;
        Printf.sprintf "Host: %s\r\n" host;
        optional_header "User-Agent" user_agent;
        optional_header "Referer" referer;
        body_headers;
        (* The blank line ends the header section. *)
        "\r\n";
        (* Nothing may follow the body: Content-length covers exactly it. *)
        body ]
  in
  let (inchan, outchan) = init_socket address port in
  (try
     output_string outchan request;
     flush outchan
   with e ->
     (* Both channels share one descriptor: closing one frees the socket. *)
     close_out_noerr outchan;
     raise e);
  (inchan, outchan)

(** [strip_cr str] returns a copy of [str] with every carriage return
    character (['\r']) removed; all other characters are kept in order. *)
let strip_cr str =
  (* Built with a Buffer: strings are immutable in current OCaml, so the
     previous in-place [String.create] / [s.[i] <- c] approach no longer
     compiles. *)
  let buf = Buffer.create (String.length str) in
  String.iter (fun c -> if c <> '\r' then Buffer.add_char buf c) str;
  Buffer.contents buf

(** [cont_of_inchan ?limit ic] reads an HTTP response from [ic] and returns
    the triple [(first_line, header, page)]:

    - [first_line] is the first line read, with carriage returns removed;
    - [header] holds the following lines up to the first empty line (or end
      of input), with carriage returns removed, in REVERSE order of
      arrival (the last header line read is the head of the list);
    - [page] is the rest of the input, read until end of input, or at most
      [limit] bytes when [limit] is given (a [limit] of zero or less reads
      nothing).

    The channel is not closed.

    @raise End_of_file if [ic] is already at end of input when the first
    line is read. *)
let cont_of_inchan ?limit ic =
  let first_line = strip_cr (input_line ic) in
  let rec get_header acc =
    match input_line ic with
    | "" | "\r" -> acc
    | line -> get_header (strip_cr line :: acc)
    | exception End_of_file -> acc
  in
  let header = get_header [] in
  let chunk_size = 1024 in
  (* [input] fills a [bytes] buffer; a [string] is no longer accepted. *)
  let chunk = Bytes.create chunk_size in
  let buf = Buffer.create 10240 in
  (* [remaining] is [None] for an unbounded read, or [Some n] when at most
     [n] more bytes may be read. [input] returns 0 at end of input (and
     when asked for 0 bytes), which ends the loop. *)
  let rec read_body remaining =
    let wanted =
      match remaining with
      | None -> chunk_size
      | Some n -> max 0 (min n chunk_size)
    in
    let got = input ic chunk 0 wanted in
    if got > 0 then begin
      Buffer.add_subbytes buf chunk 0 got;
      read_body
        (match remaining with
         | None -> None
         | Some n -> Some (n - got))
    end
  in
  read_body limit;
  (first_line, header, Buffer.contents buf)

(** [cut_url ~url] splits [url] into [(address, path)].

    A leading ["http://"] (compared case-insensitively, as URL schemes are)
    is dropped. [address] is what remains up to the first ['/'], and [path]
    is the rest, starting with that ['/']. When there is no ['/'], [path]
    is ["/"].

    No other scheme is recognised and a port suffix is not separated from
    the address. *)
let cut_url ~url =
  let scheme = "http://" in
  let scheme_len = String.length scheme in
  let url_len = String.length url in
  let rest =
    if url_len >= scheme_len
       && String.lowercase_ascii (String.sub url 0 scheme_len) = scheme
    then String.sub url scheme_len (url_len - scheme_len)
    else url
  in
  (* Only the "no slash" case is expected here, so only [Not_found] is
     handled; anything else would be a real error and must not be hidden. *)
  match String.index rest '/' with
  | pos ->
      (String.sub rest 0 pos,
       String.sub rest pos (String.length rest - pos))
  | exception Not_found -> (rest, "/")

(** [make_request ~url ?port ?kind ?referer ?user_agent ()] performs one
    HTTP request and returns the response as the triple produced by
    [cont_of_inchan].

    [url] is split with [cut_url]; [port] defaults to [80] and [kind] to
    [GET]; [referer] and [user_agent] are passed on to [submit_request]
    and are omitted from the request when absent.

    The connection is closed before returning, including when reading the
    response raises an exception (which is then re-raised). *)
let make_request ~url ?(port=80) ?(kind=GET) ?referer ?user_agent () =
  let (address, path) = cut_url ~url in
  let (inchan, _outchan) =
    submit_request ~address ~port ~kind ~path ~referer ~user_agent
  in
  let cont =
    try cont_of_inchan inchan
    with e ->
      (* Do not leak the socket when the response cannot be read. *)
      close_in_noerr inchan;
      raise e
  in
  (* Closing the input channel also closes the descriptor it shares with
     the output channel. *)
  close_in inchan;
  cont

</lang>