;; The first three lines of this file were inserted by DrScheme. They record metadata
;; about the language level of this file in a form that our tools can easily process.
#reader(lib "reader.ss" "plai" "lang")

;; We could write code that opens a port, reads in characters,
;; parses the HTML, and turns it all into Scheme lists... or
;; we could use these two libraries.
(require
 ;; Provides sxml:document
 (planet lizorkin/sxml:2:1/sxml)
 ;; Provides html->sxml
 (planet ashinn/html-parser/html-parser)
 ;; And SchemeUnit
 (planet schematics/schemeunit:3)
 (planet schematics/schemeunit:3/text-ui))

;; CONTRACT
;; description-text :: list -> string
;; PURPOSE
;; Joins every string found directly in `children` into a single string,
;; skipping anything that is not a string. Returns "" when there are none.
;; Joining first means markup that arrives split over several string
;; children is handed to the HTML parser as one piece.
(define (description-text children)
  (cond
    [(null? children) ""]
    [(string? (car children))
     (string-append (car children) (description-text (cdr children)))]
    [else (description-text (cdr children))]))

;; CONTRACT
;; parse-children :: list -> list
;; PURPOSE
;; Runs `parse` on every member of `nodes` and appends the results
;; into one flat list, preserving document order.
(define (parse-children nodes)
  (apply append (map parse nodes)))

;; CONTRACT
;; parse :: tree -> list
;; PURPOSE
;; Return a list of URLs.
;;
;; Walks `exp` (a list-of-lists tree) and collects, in document order:
;;   * the string of every node shaped like (href "some-url"), and
;;   * every URL found inside the text of a (description ...) node, whose
;;     strings are treated as HTML: they are run through `cleanup` and the
;;     resulting tree is parsed again.
;; Anything that is not a non-empty proper list (strings, symbols, numbers,
;; the empty list) contributes no URLs.
(define (parse exp)
  (cond
    ;; Atoms, the empty list and improper pairs hold no elements.
    [(not (and (pair? exp) (list? exp))) empty]
    ;; (href "url" ...): the URL is the first child.
    [(and (eq? (car exp) 'href)
          (pair? (cdr exp))
          (string? (cadr exp)))
     (list (cadr exp))]
    ;; (description ...): string children are markup, not plain text.
    [(eq? (car exp) 'description)
     (let ([text (description-text (cdr exp))])
       (append
        ;; An empty description has nothing to hand to the HTML parser.
        (if (string=? text "")
            empty
            (parse (cleanup text)))
        ;; Non-string children are ordinary subtrees; strings yield nothing here.
        (parse-children (cdr exp))))]
    ;; Any other element: look inside each of its members.
    [else (parse-children exp)]))

;; CONTRACT
;; flatten :: list-of-lists -> list
;; PURPOSE
;; Returns one flat list holding every non-list value found anywhere
;; inside `lol`, in left-to-right order. Empty lists nested inside
;; `lol` vanish; a flat list comes back unchanged.
(define (flatten lol)
  (cond
    [(null? lol) empty]
    [(pair? lol) (append (flatten (car lol)) (flatten (cdr lol)))]
    ;; A lone non-list value becomes a one-element list so `append` can join it.
    [else (list lol)]))

;; PURPOSE
;; Takes a URL and runs all your code on it.
;; When you're done with "flatten" and "parse", you
;; should be able to use this on any weblog URL.
;;
;; You may find real-world uglies in data. I look
;; forward to hearing about them.
;;
;; Here is the URL for the course blog:
;;
;; http://www.rockalypse.org/courses/cmpsc220sp09/blog/index.xml
;;
;; It works with my basic parser.
(define (uber url)
  (flatten (parse (get-feed url))))

;; CONTRACT
;; strip-newlines :: string -> string
;; PURPOSE
;; Uses a regular expression to delete every newline character
;; in a string. We'll want this in our parsing.
;;
;; NOTE(review): newlines are removed outright, not replaced by a space, so
;; tokens separated only by a newline are joined (e.g. "<a\nhref" becomes
;; "<ahref"). Confirm that is acceptable for the feeds being read.
(define (strip-newlines str)
  (regexp-replace* #rx"\n" str ""))

;; CONTRACT
;; get-feed :: string -> SXML
;; PURPOSE
;; I have no idea why I wrapped a method of one argument in
;; a method of one argument... this really does nothing.
;; But, it takes a URL, and returns a Schemeified version of
;; the contents of that URL.
(define (get-feed url)
  ;; Thin wrapper: pass the URL straight to sxml:document and return its result.
  (sxml:document url))

;; CONTRACT
;; cleanup :: string -> SXML
;; PURPOSE
;; Cleans up the string by stripping newlines, and
;; then converts it to SchemeXML, or our list-of-lists
;; representation. Used on 'description elements.
;; The resulting data is then able to be fed back
;; into 'parse'.
(define (cleanup string)
  (html->sxml (strip-newlines string)))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; TESTS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FLATTEN TESTS
;; You can probably write more, but this gets you going.
;; For example, what happens if there are empty lists
;; sprinkled within the other lists?
(define flatten-suite
  (test-suite
   "flatten"
   (check-equal? empty
                 (flatten empty)
                 "Empty test")
   (check-equal? (list 1 2 3)
                 (flatten (list 1 2 3))
                 "Flat list to flat list")
   (check-equal? (list 1 2 3 4 5 6 7 8 9)
                 (flatten (list (list 1 2 3)
                                (list 4 5 6)
                                (list 7 8 9)))
                 "Hm. Almost soduku.")))

;; PARSE TESTS
;; These are a good starting point. I've included one test
;; that looks brain damaging (involving "description"), but
;; surprisingly that is nearly exactly what I encountered
;; in a real-world test.
;;
;; You might add some other tests that include multiple URLs,
;; etc. I've given you a good start, I think.
(define parse-suite
  (test-suite
   "parse"
   (check-equal? empty
                 (flatten (parse empty))
                 "Empty test")
   (check-equal? empty
                 (flatten (parse '(p "Hello")))
                 "No 'href' elements, so the list should be empty.")
   (check-equal? '("http://www.rockalypse.org/")
                 (flatten (parse '(p (href "http://www.rockalypse.org/"))))
                 "An 'href' element. Should work.")
   (check-equal? '("http://www.rockalypse.org/")
                 (flatten
                  (parse '(div (p "Some text." (em "More text") "And more.")
                               (p (href "http://www.rockalypse.org/")))))
                 "A deeper structure, which may test some things in our implementation.")
   ;; The second description holds HTML markup as a plain string, with a
   ;; newline in the middle of it. The URL can only be found by cleaning
   ;; that string up, converting it to a tree, and parsing the tree.
   (check-equal? '("http://www.rockalypse.org/")
                 (flatten
                  (parse '(div (p (description) "Some text")
                               (p (description "<a href=\"http://www.rockalypse.org/\">
</a>"))
                               (p "This is actually a real test case."))))
                 "An apparently diabolical test case, but I found it in real-world data.")))

(define all-tests
  (test-suite
   "All Tests"
   flatten-suite
   parse-suite))

(run-tests all-tests)