;; Teachpack for screen-scraping data from Kiva
;; Shriram Krishnamurthi, extended by Kathi Fisler
;; v3: January 2008 (updated to reflect changes in Kiva data format)
;; v2: September 2007 (added exceptions to handle malformed data)
;; v1: August 2007
;; Tested under v372
;;;; Documentation for users ;;;;;;;;;;;;;;;;;;;;;;;;
;; The teachpack exports 2 functions and one constant:
;;
;; (get-kiva-data/page number) produces a list of numbers and
;; strings corresponding to the data on the given page number of
;; the kiva website. Website entries may be omitted if their html
;; was formatted differently from our expectations.
;; This function does not work in versions below 371.
;;
;; (get-kiva-data) produces a list of numbers and strings
;; corresponding to the data on the first page of the kiva website.
;; Website entries may be omitted if their html was formatted
;; differently from our expectations.
;; This function works under Drscheme 370.
;;
;; sample-data is a constant of data in the format coming from the
;; website that students can use to test their functions before
;; connecting to the actual website.
;;;; Documentation for developers ;;;;;;;;;;;;;;;;;;;;;;;;
;; This version demonstrates how we can aggregate data from
;; across multiple lines:
;; ID-NAME-ACTIVITY/R matches the second column (first after the
;; photo) to get the name and activity as well as the ID number
;; (useful for further indexing into site).
;; The LOAN/R regexp matches the second column (loan amount and
;; percentage raised). [NB: percentage is currently computed from
;; the pixel width of the bar, which is not robust. The regexp
;; ought really to get it from the text instead.]
;; The procedure PARSE-ID-NAME-ACTIVITY-LOANS returns the match for
;; whichever regexp fits a given line, without recording which one it
;; was; we expect the matches to arrive strictly interleaved
;; (id/name, activity, loan, country).
;; COLLATE consumes this interleaved list and concatenates each run of
;; four consecutive matches, forming an entry record for each
;; entrepreneur.
;; By extending PARSE-ID-NAME-ACTIVITY-LOANS and COLLATE in parallel,
;; we can support other columns (such as Country/Partner) -- though
;; this is not a very informative line of source (it would have been
;; much easier if they'd used CSS class tags for each entry).
(module kiva-teachpack-nogroups mzscheme
(require (lib "pregexp.ss")
(lib "pretty.ss")
(lib "list.ss")
(lib "url.ss" "net")
(lib "error.ss" "htdp"))
(provide get-kiva-data get-kiva-data/page sample-data)
;; print-url : url -> void
;; debugging aid: displays every component of the given url structure,
;; one component per line, in the order scheme, user, host, port,
;; path-absolute?, path, query, fragment
(define (print-url url)
  (for-each (lambda (component-of)
              (display (component-of url))
              (newline))
            (list url-scheme
                  url-user
                  url-host
                  url-port
                  url-path-absolute?
                  url-path
                  url-query
                  url-fragment)))
;; regexps to match particular shapes of kiva lines
;; id-name/r : pattern string for the line that carries a borrower's
;; id number and name; built from quoted literal text around "(.+)".
;; NOTE(review): PARSE-ID-NAME-ACTIVITY-LOANS reads TWO capture groups
;; from a match of this pattern (id, then name), but only one "(.+)"
;; group is visible here and the quoted literals are almost empty --
;; the HTML they were meant to match looks to have been lost. Confirm
;; against the original teachpack before relying on this pattern.
(define id-name/r
(string-append
(pregexp-quote
"
")
"(.+)"
(pregexp-quote "")
))
;; activity/r : pattern string for the line that carries a borrower's
;; activity; a quoted literal prefix (tab followed by newline as
;; written here) followed by one capture group holding the activity.
;; NOTE(review): the literal prefix presumably once contained HTML
;; markup that has been lost -- confirm against the original teachpack.
(define activity/r
(string-append
(pregexp-quote
"\t
")
"(.+)"
))
;; loan/r : pattern string for the line that carries the loan amount:
;; a literal "$", a capture group, a literal ".", a second capture
;; group, and a literal space.
;; NOTE(review): PARSE-ID-NAME-ACTIVITY-LOANS takes the percentage
;; raised from a THIRD capture group (cadddr of the match), but only
;; two groups are visible here -- part of this pattern looks to have
;; been lost. Confirm against the original teachpack.
(define loan/r
(string-append
(pregexp-quote
"$")
"(.+)" ;; grab number possibly including commas
(pregexp-quote ".")
"(.+)" ;;grab cents and sometimes some other data
(pregexp-quote
" ")))
;; country/r : pattern string for the line that carries the country:
;; a literal space, one capture group holding an upper-case letter
;; followed by one or more lower-case letters, and a literal space.
;; NOTE(review): as written this only captures a single capitalised
;; word; presumably the surrounding literals once held HTML markup --
;; confirm against the original teachpack.
(define country/r
(string-append
(pregexp-quote " ")
"([A-Z][a-z]+)"
(pregexp-quote " ")))
;; parse-id-name-activity-loans : string -> list[number+string] or #f
;; Tries the line against the kiva regexps in the order id/name,
;; activity, loan, country and returns the fields captured by the
;; first regexp that matches:
;;   id/name  -> (list id-number name)
;;   activity -> (list activity)
;;   loan     -> (list amount percent-raised)
;;   country  -> (list country)
;; Returns #f when no regexp matches the line.
(define (parse-id-name-activity-loans l)
  ;; the part of l covered by one (start . end) position pair
  (define (slice span)
    (substring l (car span) (cdr span)))
  ;; copy of s with every comma removed, so "1,200" reads as a number
  (define (drop-commas s)
    (list->string
     (filter (lambda (ch) (not (char=? ch #\,)))
             (string->list s))))
  (cond
    [(pregexp-match-positions id-name/r l)
     => (lambda (m)
          (let ([id-span (cadr m)]
                [name-span (caddr m)])
            (list (string->number (slice id-span))
                  (slice name-span))))]
    [(pregexp-match-positions activity/r l)
     => (lambda (m)
          (let ([activity-span (cadr m)])
            (list (slice activity-span))))]
    [(pregexp-match-positions loan/r l)
     => (lambda (m)
          ;; the group in between (caddr) holds the cents and some
          ;; other text; it is deliberately ignored
          (let ([amount-span (cadr m)]
                [percent-span (cadddr m)])
            (list (string->number (drop-commas (slice amount-span)))
                  (string->number (slice percent-span)))))]
    [(pregexp-match-positions country/r l)
     => (lambda (m)
          (let ([country-span (cadr m)])
            (list (slice country-span))))]
    [else #f]))
;; check-format : list[value] number (value -> bool) ... -> bool
;; Succeeds when of-list has exactly expected-len elements and every
;; element satisfies the predicate given at the same position.
;; The predicates are only consulted once the length test has passed.
(define (check-format of-list expected-len . eltpreds)
  (if (= (length of-list) expected-len)
      (andmap (lambda (elt ok?) (ok? elt))
              of-list
              eltpreds)
      #f))
;; collate : list[list[string+number]] -> list[list[string+number]]
;; Combines the per-line matches for one borrower into a single flat
;; record. A record is formed from four consecutive matches shaped as
;;   (number string) (string) (number number) (string)
;; i.e. id/name, activity, loan, country. When the four matches at the
;; current position do not have that shape, the first one is dropped
;; and the search resumes from the next match. Trailing matches that
;; are too few to form a record are discarded.
(define (collate interleaved)
  ;; true when l holds fewer than four elements; looks at no more than
  ;; the first four pairs instead of measuring the whole list each step
  (define (fewer-than-four? l)
    (or (null? l)
        (null? (cdr l))
        (null? (cddr l))
        (null? (cdddr l))))
  (let loop ([l interleaved])
    (cond
      [(fewer-than-four? l) '()] ;; not enough fields to form entry
      [(and (check-format (first l) 2 number? string?)
            (check-format (second l) 1 string?)
            (check-format (third l) 2 number? number?)
            (check-format (fourth l) 1 string?))
       (cons (append (first l) (second l) (third l) (fourth l))
             ;; four matches were consumed, so skip all four
             (loop (cddddr l)))]
      [else (loop (cdr l))])))
;; get-all-lines : port -> list[string]
;; Reads lines from p until end of file, closes p, and returns the
;; lines in the order they were read.
(define (get-all-lines p)
  (let loop ([seen '()]) ;; lines read so far, most recent first
    (let ([next (read-line p)])
      (cond
        [(eof-object? next)
         (close-input-port p)
         (reverse seen)]
        [else
         (loop (cons next seen))]))))
;; web-page-data : string -> list[string]
;; Fetches the page at the url given as a string and returns its
;; contents as a list of lines.
(define (web-page-data url-string)
  (let* ([target (string->url url-string)]
         [page-port (get-pure-port target)])
    (get-all-lines page-port)))
;; file-data : string -> list[string]
;; Returns the lines of the file with the given name; useful for
;; running the parser on a saved copy of a page.
(define (file-data filename)
  (let ([file-port (open-input-file filename)])
    (get-all-lines file-port)))
;; filter-map : (alpha->[beta or #f]) list[alpha] -> list[beta]
;; Produces the list of results of running f on each item of l, in
;; order, leaving out items for which f returns #f and items for which
;; f raises an exn:fail exception.
;; The exception handler covers only the call to f on the current item.
;; The rest of the list is processed outside the handler, so an error
;; raised while processing the rest is not caught again here (which
;; would re-run f on items that have already been processed).
(define (filter-map f l)
  (if (empty? l)
      l
      (let* ([result (with-handlers ([exn:fail? (lambda (exn) #f)])
                       (f (car l)))]
             [others (filter-map f (cdr l))])
        (if result
            (cons result others)
            others))))
;; go : list[string] -> list[list[string+number]]
;; Turns the lines of a kiva page into borrower records: every line is
;; run through the line parser, lines that yield nothing are dropped,
;; and the remaining matches are collated into one list per borrower.
(define (go ls)
  (let ([matches (filter-map parse-id-name-activity-loans ls)])
    (collate matches)))
;; form-kiva-url : number -> string
;; Builds the address of the kiva businesses page with the given page
;; number, passed as the pageID query parameter.
;; DOES NOT WORK UNDER 370
(define (form-kiva-url pagenum)
  (let ([page-prefix "http://www.kiva.org/app.php?page=businesses&nocache=&pageID="])
    (string-append page-prefix (format "~a" pagenum))))
;; get-kiva-data/page : number -> list[list[number+string]]
;; Produces the list of borrower records found on the kiva page with
;; the given page number. Signals an error through check-arg when
;; pagenum is not a number.
;; DOES NOT WORK UNDER 370
(define (get-kiva-data/page pagenum)
  ;; the expected-kind label matches the number? test actually performed
  (check-arg 'get-kiva-data/page (number? pagenum) 'number "first" pagenum)
  (go (web-page-data (form-kiva-url pagenum))))
;; url that works under 370
;; KIVA-BASE-URL : string
;; address of the kiva businesses page without a pageID parameter;
;; GET-KIVA-DATA fetches this page
(define KIVA-BASE-URL "http://www.kiva.org/app.php?page=businesses")
;; get-kiva-data : -> list[list[number+string]]
;; Produces the list of borrower records found on the base kiva page
;; (the one at KIVA-BASE-URL).
(define (get-kiva-data)
  (let ([page-lines (web-page-data KIVA-BASE-URL)])
    (go page-lines)))
;; If you keep iterating through these pageID's,
;; non-existent pages have this string we can check for:
;; "No businesses found".
; snapshot of data from Sept 23, 2007 for testing purposes
; sample-data : list[list[number+string]]
; each entry lists, in order: a number, the borrower's name, the
; activity, two further numbers, and the country -- the same layout as
; the records COLLATE builds (id, name, activity, loan amount,
; percentage raised, country)
; NOTE(review): the "?" characters inside some names look like accented
; letters lost in an encoding conversion -- confirm before correcting
(define sample-data
'((18864 "ELIZABETH S?NCHEZ S?NCHEZ" "Fish Selling" 575 73 "Peru")
(18725 "Angelina Britez" "Weaving" 1000 27 "Paraguay")
(18859 "VICTOR VIDAL MAYTA QUISPE" "Agriculture" 650 84 "Peru")
(18995 "Ramzieh El Samadi" "Crafts" 1000 32 "Lebanon")
(18898 "Anita Poutoa" "Retail" 300 91 "Samoa")
(18994 "Rouwaida Mattar" "Embroidery" 700 60 "Lebanon")
(18961 "Savriniso Ismoilova" "Retail" 800 87 "Tajikistan")
(18965 "Zulaihon Mamdjonova" "Shoe Sales" 1200 31 "Tajikistan")
(18869 "LINA PRESENTACI?N RE?TEGUI DE L?PEZ" "General Store" 325 30 "Peru")
(18967 "Faiziniso Ahmedova" "Clothing Sales" 1200 18 "Tajikistan")
(18868 "Virgina Siufaga" "Taxi" 300 75 "Samoa")
(18896 "Evo Lo Tam" "Retail" 300 25 "Samoa")
(18982 "Mar?a Sefstran" "Grocery Store" 1000 65 "Paraguay")
(18964 "Valentina Zarubenko" "Clothing Sales" 1000 55 "Ukraine")
(17738 "Eldaniz Usubov" "Spare Parts" 1200 87 "Azerbaijan")
(18962 "Nataliya Fischook" "Retail" 1200 27 "Ukraine")
(18963 "Yuryi Gah" "Taxi" 600 45 "Ukraine")
(17721 "Asif Aliyev" "Farming" 1200 70 "Azerbaijan")
(18792 "Fuad Mammadov" "Farming" 1200 20 "Azerbaijan")
(17734 "Mirsadig Miriyev" "Farming" 1200 66 "Azerbaijan")
(18604 "Erasmo Castillo Ni?o" "Home Products Sales" 1200 93 "Mexico")
(18891 "Ake Faauli" "Retail" 300 50 "Samoa")
(18886 "Tuisugaigoa Ioane" "General Store" 300 41 "Samoa")
(18870 "Vaalele Faamitai" "Taxi" 300 58 "Samoa")
(18842 "Galu Vainalepa" "Services" 500 60 "Samoa")
(17735 "Dayanat Novruzov" "Farming" 1200 35 "Azerbaijan")
(18726 "Concepcion Candia" "Grocery Store" 600 8 "Paraguay")
(18867 "Olivia Mose" "Taxi" 300 41 "Samoa")
(18897 "Taikoke Luaofo" "Retail" 300 8 "Samoa")))
) | |