;; Teachpack for screen-scraping data from Kiva
;; Shriram Krishnamurthi, extended by Kathi Fisler
;; v4: March 2008 (extended to extract number of entrepreneurs in
;; borrowing group -- defaults to 1 if not listed)
;; v3: January 2008 (updated to reflect changes in Kiva data format)
;; v2: September 2007 (added exceptions to handle malformed data)
;; v1: August 2007
;; Tested under v372
;;;; Documentation for users ;;;;;;;;;;;;;;;;;;;;;;;;
;; The teachpack exports 2 functions and one constant:
;;
;; (get-kiva-data/page number) produces a list of numbers and
;; strings corresponding to the data on the given page number of
;; the kiva website. Website entries may be omitted if their html
;; was formatted differently from our expectations.
;; This function does not work in versions below 371.
;;
;; (get-kiva-data) produces a list of numbers and strings
;; corresponding to the data on the first page of the kiva website.
;; Website entries may be omitted if their html was formatted
;; differently from our expectations.
;; This function works under DrScheme 370.
;;
;; sample-data is a constant of data in the format coming from the
;; website that students can use to test their functions before
;; connecting to the actual website.
;;;; Documentation for developers ;;;;;;;;;;;;;;;;;;;;;;;;
;; This version demonstrates how we can aggregate data from
;; across multiple lines:
;; ID-NAME/R and ACTIVITY/R match the second column (first after the
;; photo) to get the ID number and name, and the activity, respectively
;; (the ID number is useful for further indexing into the site).
;; The LOAN/R regexp matches the second column (loan amount and
;; percentage raised). [NB: percentage is currently computed from
;; the pixel width of the bar, which is not robust. The regexp
;; ought really to get it from the text instead.]
;; The ENTREPRENEUR/R regexp matches designations that there are
;; multiple entrepreneurs involved in a loan. Some borrowers
;; have this and some do not. The remaining functions use a
;; default value of 1 for this if the HTML did not specify an
;; entrepreneur headcount.
;; The procedure PARSE-ID-NAME-ACTIVITY-LOANS returns matches to
;; these in indiscriminate order, but we expect that they will
;; strictly interleave.
;; COLLATE consumes this interleaved list and concatenates each run of
;; four consecutive matches (five when an entrepreneur count is present),
;; forming one entry record for each loan.
;; By extending PARSE-ID-NAME-ACTIVITY-LOANS and COLLATE in parallel,
;; we can support other columns (such as Country/Partner) -- though
;; this is not a very informative line of source (it would have been
;; much easier if they'd used CSS class tags for each entry).
(module kiva-teachpack mzscheme
(require (lib "pregexp.ss")
(lib "pretty.ss")
(lib "list.ss")
(lib "url.ss" "net")
(lib "error.ss" "htdp"))
(provide get-kiva-data get-kiva-data/page sample-data)
;; print-url : url -> void
;; Debugging aid: displays each component of the given url structure on the
;; current output port, one component per line, in the order scheme, user,
;; host, port, path-absolute?, path, query, fragment.
(define (print-url url)
  (for-each
   (lambda (field-of)
     (display (field-of url))
     (newline))
   (list url-scheme url-user url-host url-port
         url-path-absolute? url-path url-query url-fragment)))
;; regexps to match particular shapes of kiva lines
;; id-name/r : string (pattern source handed to pregexp-match-positions)
;; Built by concatenating a quoted literal (as written here, a single
;; newline), one capture group "(.+)", and a quoted empty string.
;; NOTE(review): parse-id-name-activity-loans reads TWO capture groups from
;; a match of this pattern (the id, then the name), but only one group is
;; present here, and the quoted literals look as if text has been lost from
;; them -- confirm against a known-good copy of the teachpack.
(define id-name/r
(string-append
(pregexp-quote
"
")
"(.+)"
(pregexp-quote "")
))
;; entrepreneur/r : string (pattern source handed to pregexp-match-positions)
;; Built by concatenating a quoted literal (as written here, the escape \t
;; followed by a newline), a capture group for a run of decimal digits, and
;; the quoted literal " Entrepreneurs". The single capture group is the
;; headcount that parse-id-name-activity-loans converts to a number.
;; NOTE(review): the leading literal may have lost text -- confirm.
(define entrepreneur/r
(string-append
(pregexp-quote "\t
")
"([0-9]+)" ;; grab number
(pregexp-quote " Entrepreneurs")
))
;; activity/r : string (pattern source handed to pregexp-match-positions)
;; Built by concatenating a quoted literal (as written here, the escape \t
;; followed by a newline) and one capture group "(.+)", which
;; parse-id-name-activity-loans returns as the activity string.
;; NOTE(review): the leading literal may have lost text -- confirm.
(define activity/r
(string-append
(pregexp-quote
"\t
")
"(.+)"
))
;; loan/r : string (pattern source handed to pregexp-match-positions)
;; Built by concatenating a quoted "$", a capture group, a quoted ".", a
;; second capture group, and a quoted single space.
;; NOTE(review): parse-id-name-activity-loans reads the percentage raised
;; from a THIRD capture group (cadddr of the match), but this pattern has
;; only two groups, and the quoted literals look as if text has been lost
;; from them -- confirm against a known-good copy of the teachpack.
(define loan/r
(string-append
(pregexp-quote
"$")
"(.+)" ;; grab number possibly including commas
(pregexp-quote ".")
"(.+)" ;;grab cents and sometimes some other data
(pregexp-quote
" ")))
;; country/r : string (pattern source handed to pregexp-match-positions)
;; Built by concatenating a quoted space, a capture group for one uppercase
;; letter A-Z followed by one or more lowercase letters a-z, and another
;; quoted space. The capture group is returned by
;; parse-id-name-activity-loans as the country string.
;; NOTE(review): the surrounding literals may have lost text -- confirm.
(define country/r
(string-append
(pregexp-quote " ")
"([A-Z][a-z]+)"
(pregexp-quote " ")))
;; parse-id-name-activity-loans : string -> list[number+string] or #f
;; Tries each line pattern in turn (id/name, activity, entrepreneur count,
;; loan, country) against the line L and returns the fields captured by the
;; first pattern that matches:
;;   id/name      -> (list id-number name-string)
;;   activity     -> (list activity-string)
;;   entrepreneur -> (list headcount-number)
;;   loan         -> (list amount-number percent-raised-number)
;;   country      -> (list country-string)
;; Returns #f when no pattern matches.
(define (parse-id-name-activity-loans l)
  ;; text : (cons start end) -> string
  ;; the piece of the line covered by one match-position pair
  (define (text span)
    (substring l (car span) (cdr span)))
  ;; without-commas : string -> string
  ;; drops every comma so that amounts such as "1,200" read as numbers
  (define (without-commas s)
    (list->string
     (filter (lambda (c) (not (char=? c #\,)))
             (string->list s))))
  (cond
    [(pregexp-match-positions id-name/r l)
     => (lambda (m)
          (let ([id-span (cadr m)]
                [name-span (caddr m)])
            (list (string->number (text id-span))
                  (text name-span))))]
    [(pregexp-match-positions activity/r l)
     => (lambda (m)
          (let ([activity-span (cadr m)])
            (list (text activity-span))))]
    [(pregexp-match-positions entrepreneur/r l)
     => (lambda (m)
          (let ([count-span (cadr m)])
            (list (string->number (text count-span)))))]
    [(pregexp-match-positions loan/r l)
     => (lambda (m)
          ;; the group in between (caddr) holds the cents plus other
          ;; text and is deliberately ignored
          (let ([amount-span (cadr m)]
                [percent-span (cadddr m)])
            (list (string->number (without-commas (text amount-span)))
                  (string->number (text percent-span)))))]
    [(pregexp-match-positions country/r l)
     => (lambda (m)
          (let ([country-span (cadr m)])
            (list (text country-span))))]
    [else #f]))
;; check-format : list[value] number (value -> bool) ... -> bool
;; True when OF-LIST has exactly EXPECTED-LEN elements and each element
;; satisfies the predicate in the same position of ELTPREDS. The predicates
;; are only consulted once the length has been confirmed.
(define (check-format of-list expected-len . eltpreds)
  (if (= expected-len (length of-list))
      (andmap (lambda (pred elt) (pred elt)) eltpreds of-list)
      #f))
;; collate : list[list[string+number]] -> list[list[string+number]]
;; Groups the per-line matches for one borrower into a single flat record.
;; INTERLEAVED is the list of per-line results from the parser, in page
;; order. Two shapes are recognised at the front of the list:
;;   individual: (id name) (activity) (amount pct) (country)
;;   group:      (id name) (count) (activity) (amount pct) (country)
;; Either shape yields (id name count activity amount pct country), with
;; count defaulting to 1 for individuals. A front item that starts neither
;; shape is dropped and scanning resumes at the next item. Scanning stops
;; as soon as fewer than four items remain.
(define (collate interleaved)
  ;; at-least? : list number -> bool
  ;; true when l has n or more elements; inspects at most n pairs, so the
  ;; loop below does not re-measure the whole remaining list on every step
  (define (at-least? l n)
    (or (<= n 0)
        (and (pair? l) (at-least? (cdr l) (- n 1)))))
  (let loop ([l interleaved])
    (cond
      ;; not enough fields left to form an entry
      [(not (at-least? l 4)) '()]
      ;; entry for individual (no entrepreneur count): consumes 4 items
      [(and (check-format (first l) 2 number? string?)
            (check-format (second l) 1 string?)
            (check-format (third l) 2 number? number?)
            (check-format (fourth l) 1 string?))
       (cons (append (first l) (list 1) (second l) (third l) (fourth l))
             ;; skip exactly the four items just consumed
             (loop (cddddr l)))]
      ;; entry for group (has entrepreneur count): consumes 5 items
      [(and (at-least? l 5)
            (check-format (first l) 2 number? string?)
            (check-format (second l) 1 number?)
            (check-format (third l) 1 string?)
            (check-format (fourth l) 2 number? number?)
            (check-format (fifth l) 1 string?))
       (cons (append (first l) (second l) (third l) (fourth l) (fifth l))
             ;; skip exactly the five items just consumed
             (loop (cdr (cddddr l))))]
      ;; otherwise: misaligned entry, resynchronise one item further on
      [else (loop (cdr l))])))
;; get-all-lines : port -> list[string]
;; Reads P line by line (using read-line's default line mode) until
;; end-of-file and returns the lines in the order they were read.
;; The port is closed before this function returns, and also when reading
;; raises an exception, so a failed read does not leave the port open.
(define (get-all-lines p)
  (dynamic-wind
   void
   (lambda ()
     ;; accumulate in reverse so the loop is a tail call
     (let loop ([seen '()])
       (let ([l (read-line p)])
         (if (eof-object? l)
             (reverse seen)
             (loop (cons l seen))))))
   (lambda () (close-input-port p))))
;; web-page-data : string -> list[string]
;; Parses URL-STRING as a url, opens a pure port on it, and returns every
;; line available on that port.
(define (web-page-data url-string)
  (let* ([target (string->url url-string)]
         [in (get-pure-port target)])
    (get-all-lines in)))
;; file-data : string -> list[string]
;; Opens the file named FILENAME for input and returns all of its lines;
;; use it to work from data saved in a file.
(define (file-data filename)
  (let ([in (open-input-file filename)])
    (get-all-lines in)))
;; filter-map : (alpha -> [beta or #f]) list[alpha] -> list[beta]
;; Applies F to each item of L in order and collects the non-false results.
;; An item is skipped when F returns #f for it, and also when an exn:fail
;; exception is raised while it is being processed.
(define (filter-map f l)
  (cond
    [(empty? l) l]
    [else
     (let ([skip-this-item (lambda (exn) (filter-map f (cdr l)))])
       (with-handlers ([exn:fail? skip-this-item])
         (let ([result (f (car l))])
           (if result
               (cons result (filter-map f (cdr l)))
               (filter-map f (cdr l))))))]))
;; go : list[string] -> list[list[string+number]]
;; Turns the lines of a web page into borrower records: each line is run
;; through the line parser (lines that yield nothing are dropped), and the
;; surviving matches are collated into one list per borrower.
(define (go ls)
  (let ([matches (filter-map parse-id-name-activity-loans ls)])
    (collate matches)))
;; form-kiva-url : number -> string
;; Builds the address of the kiva businesses page with the given page id
;; by substituting PAGENUM into the url template.
;; DOES NOT WORK UNDER 370
(define (form-kiva-url pagenum)
  (let ([template "http://www.kiva.org/app.php?page=businesses&nocache=&pageID=~a"])
    (format template pagenum)))
;; get-kiva-data/page : number -> list[list[number+string]]
;; Fetches the kiva page with the given page number and returns one record
;; per borrower found on it.
;; PAGENUM must be an integer; any other argument is reported through
;; check-arg as a bad first argument. (The check previously used number?,
;; which let values such as 1.5 through even though the error message
;; promises an integer.)
;; DOES NOT WORK UNDER 370
(define (get-kiva-data/page pagenum)
  (check-arg 'get-kiva-data/page (integer? pagenum) 'integer "first" pagenum)
  (go (web-page-data (form-kiva-url pagenum))))
;; KIVA-BASE-URL : string
;; Address of the kiva businesses listing without a page id; this is the
;; page that get-kiva-data fetches. This url works under 370.
(define KIVA-BASE-URL "http://www.kiva.org/app.php?page=businesses")
;; get-kiva-data : -> list[list[number+string]]
;; Fetches the base kiva businesses page and returns one record per
;; borrower found on it.
(define (get-kiva-data)
  (let ([page-lines (web-page-data KIVA-BASE-URL)])
    (go page-lines)))
;; If you keep iterating through these pageID's,
;; non-existent pages have this string we can check for:
;; "No businesses found".
; snapshot of data from Sept 23, 2007 for testing purposes
; sample-data : list[list[number+string]]
; Each record has the same seven-field layout that collate produces:
;   (id name entrepreneur-count activity loan-amount percent-raised country)
; where id, entrepreneur-count, loan-amount and percent-raised are numbers
; and name, activity and country are strings.
; NOTE(review): the "?" in "Xiomara Lissette Mondrag?n" looks like an
; accented character lost in an encoding conversion -- confirm before
; correcting the string.
(define sample-data
'((39526 "Doeun Choen" 1 "Weaving" 700 32 "Cambodia")
(39700 "Raquel Tembe" 1 "Retail" 1125 31 "Mozambique")
(40129 "Maura Mamani Corso" 1 "Food Production/Sales" 650 7 "Bolivia")
(38793 "Nazia Bibi's Group" 5 "Clothing Sales" 190 50 "Pakistan")
(39203 "Keo Heang" 1 "Farming" 1100 22 "Cambodia")
(39323 "Said's Kilemberu C Group" 6 "Charcoal Sales" 204 71 "Tanzania")
(39710 "Doreen Maruma" 1 "Liquor Store / Off-License" 875 28 "Tanzania")
(40141 "Miguel Antonio Rivas Tejada" 1 "Grocery Store" 375 60 "Nicaragua")
(38794 "Richard Katugga" 1 "Goods Distribution" 1175 51 "Uganda")
(39208 "Van Sort" 1 "Grocery Store" 1000 60 "Cambodia")
(39328 "Martha Musa" 1 "Restaurant" 700 46 "Nigeria")
(39547 "Hassan Al Sibaii" 1 "Retail" 1200 72 "Lebanon")
(39713 "Marleny Yanet Llantoy Huaman" 1 "Fruits & Vegetables" 1050 35 "Peru")
(32880 "Shell II 2085(a) Group" 5 "Home Products Sales" 320 42 "Uganda")
(38798 "Nighat Bibi's Group" 5 "Retail" 280 30 "Pakistan")
(39210 "Chea Navy" 1 "Grocery Store" 1000 7 "Cambodia")
(39329 "Fumi Afolabi" 1 "Home Products Sales" 450 33 "Nigeria")
(39552 "Amal Nahle" 1 "Clothing Sales" 1000 55 "Lebanon")
(36428 "Mariana Intriago" 1 "Food Market" 1200 37 "Ecuador")
(38994 "Victor Hugo Suxo Martinez" 1 "Furniture Making" 1100 4 "Bolivia")
(39240 "Mariam's Nyota Group" 4 "Clothing Sales" 175 0 "Tanzania")
(39381 "Lila Salazar De Bardales" 1 "Catering" 700 64 "Peru")
(39592 "Xiomara Lissette Mondrag?n" 1 "Clothing Sales" 1200 72 "Nicaragua")
(40086 "Julia Ticona Bautista" 1 "Home Products Sales" 500 65 "Bolivia")
(38388 "Nuvith Garcia Torres" 1 "General Store" 450 88 "Peru")
(38995 "Valentina Condori Escobar" 1 "Clothing Sales" 1000 0 "Bolivia")
))
) | |