summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore4
-rw-r--r--bwoerter.lisp486
2 files changed, 490 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f4593fa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+# Exclude the actual data since it's huge and can be downloaded separately.
+/words.sqlite
+/data/
+/xml/
diff --git a/bwoerter.lisp b/bwoerter.lisp
new file mode 100644
index 0000000..c4d7520
--- /dev/null
+++ b/bwoerter.lisp
@@ -0,0 +1,486 @@
+;;; bwoerter.lisp
+;;; Read debate text from XML and make graphs of word frequency over time.
+
+;;; Author: Timo Wilken
+;;; Licence: GPL3
+
+;;; Using data from https://www.bundestag.de/services/opendata
+;;; This tool expects XML files from the above website to be stored in ./xml/,
+;;; next to their DTD.
+
+;;; TODO: Draw axis scales on plots.
+;;; TODO: Handle n-grams.
+;;; - read them into the DB as multi-word sequences?
+;;; - store locations of every word and reassemble n-grams query-by-query?
+
+;;; Required packages
+(ql:quickload '(:fxml :fxml/xpath :sqlite :sxql :ltk :bt-semaphore))
+
+
+;;; Words database
+
+;; SQLite syntax.
+(setq sxql:*quote-character* #\")
+
+(defparameter *db* (sqlite:connect "words.sqlite")
+ "Database connection.")
+(defparameter *min-year* nil
+ "Earliest year with any data.")
+(defparameter *max-year* nil
+ "Latest year with any data.")
+
+(defun update-year-range ()
+ "Update *MIN-YEAR* and *MAX-YEAR* to match year range stored in *DB*."
+ (multiple-value-bind (min max)
+ (sqlite:execute-one-row-m-v
+ *db* (sxql:yield (sxql:select ((:min :year) (:max :year)) (sxql:from :words))))
+ (psetq *min-year* min
+ *max-year* max)))
+
+(handler-case (update-year-range)
+ ;; If the database doesn't exist, this will throw an error. Ignore it for now.
+ (sqlite:sqlite-error () nil))
+
+(defun skip-non-word-chars (input-stream)
+ "Read from INPUT-STREAM up to, but not including, the next word character.
+Always returns NIL. No effect at end of file."
+ (handler-case
+ (do ((c (read-char input-stream) (read-char input-stream)))
+ ((alpha-char-p c) (unread-char c input-stream)))
+ (end-of-file () nil)))
+
+(defun read-word (input-stream)
+ "Read and return a run of word characters from INPUT-STREAM.
+Swallows any leading non-word characters and one trailing non-word character
+from INPUT-STREAM. If INPUT-STREAM is at end-of-stream, return NIL."
+ (let ((word (make-array 0 :element-type 'character :adjustable t)))
+ (do ((c (read-char input-stream nil nil) (read-char input-stream nil nil)))
+ (nil) ; loop until (return)
+ (cond
+ ;; got a word char: accumulate into word
+ ((and (characterp c) (alpha-char-p c))
+ (vector-push-extend c word))
+ ;; got a non-word char or EOF but we have an accumulated word: return it
+ ((> (array-total-size word) 0)
+ (return word))
+ ;; got EOF but no word accumulated: return nil
+ ((not (characterp c))
+ (return nil))
+ ;; got a non-word char but haven't seen any word chars yet: carry on
+ (t nil)))))
+
+(defun parse-xml (xml-file-pathname)
+ "Parse the debate's year and full text from the file at XML-FILE-PATHNAME."
+ (let ((doc (fxml:parse xml-file-pathname (fxml-dom:make-dom-builder)
+ :root "DOKUMENT" :forbid-external nil)))
+ (values (parse-integer (xpath:string-value (xpath:evaluate "//DATUM" doc))
+ :start 6 :end 10)
+ (xpath:string-value (xpath:evaluate "//TEXT" doc)))))
+
+(defun count-words (text &optional (normalize (lambda (word) word)))
+ "Return a hash-table of strings (words) to their counts in TEXT."
+ (let ((stream (make-string-input-stream text))
+ (counts (make-hash-table :test #'equal :size 1000)))
+ (do ((word (read-word stream) (read-word stream)))
+ ((null word) counts)
+ (incf (gethash (funcall normalize word) counts 0)))))
+
+(defmacro db-operation (operation-func &body sxql-statements)
+ "Convert SXQL-STATEMENTS to SQL and pass that OPERATION-FUNC."
+ (let ((cur-stmt (gensym))
+ (cur-args (gensym))
+ (sxql-stmt (gensym)))
+ `(sqlite:with-transaction *db*
+ (dolist (,sxql-stmt (list . ,sxql-statements))
+ (multiple-value-bind (,cur-stmt ,cur-args) (sxql:yield ,sxql-stmt)
+ (apply #',operation-func *db* ,cur-stmt ,cur-args))))))
+
+(defun create-word-database ()
+ "Create a blank database for word counts in *DB*."
+ (db-operation sqlite:execute-non-query
+ (sxql:drop-table :years :if-exists t)
+ (sxql:drop-table :words :if-exists t)
+ (sxql:create-table :years
+ ((year :type 'integer :not-null t :primary-key t)
+ (total-words :type 'integer)))
+ (sxql:create-table :words
+ ((year :type 'integer :not-null t)
+ (word :type 'string :not-null t)
+ (count :type 'integer :not-null t)
+ (count-per-100k :type 'float))
+ (sxql:primary-key '(:year :word)))))
+
+(defun make-insert-query (year word-counts)
+ "Create a SQLite query to insert WORD-COUNTS into *DB*, all in YEAR.
+Return two values: the query string and a list of parameters."
+ (let* (query-args
+ (query-start
+ (string-right-trim
+ "," (with-output-to-string (stream)
+ (princ "INSERT INTO \"words\" (\"year\", \"word\", \"count\") VALUES" stream)
+ (flet ((extend-query (word count)
+ (setq query-args (list* year word count query-args))
+ (princ "(?,?,?)," stream)))
+ (maphash #'extend-query word-counts)))))
+ (query-end " ON CONFLICT (\"year\", \"word\") DO UPDATE SET \"count\" = \"count\" + \"excluded\".\"count\""))
+ (values (concatenate 'string query-start query-end) query-args)))
+
+(defun count-all-files-into-db ()
+ "Loop through XML files, counting words and writing them to *DB*."
+ (block count
+ (let ((xmlfiles (directory (make-pathname :name :wild :type "xml" :defaults
+ (truename (make-pathname :directory '(:relative "xml"))))))
+ (num-scanned 0))
+ (gui-notify-start-scan (length xmlfiles))
+ (dolist (xmlfn xmlfiles)
+ (multiple-value-bind (year text) (parse-xml xmlfn)
+ ;; Don't use sqxl here because doing everything in a single query is
+ ;; much more efficient here (143.1 sec/file --> 0.2 sec/file)!
+ (multiple-value-bind (query args)
+ ;; NOTE: [n]string-*case don't change ß and ẞ, but handle äöü fine.
+ (make-insert-query year (count-words text #'nstring-downcase))
+ (apply #'sqlite:execute-non-query *db* query args))
+
+ ;; We're in another thread, the main window might be closed out from under us.
+ (when ltk:*exit-mainloop*
+ (return-from count))
+ (gui-notify-files-scanned (incf num-scanned) year))
+ (bt:thread-yield)))
+ (gui-notify-end-scan)))
+
+(defun update-global-db-counts ()
+ "Update yearly total word counts and normalised word counts in *DB*."
+ (db-operation sqlite:execute-non-query
+ ;; Recalculate total word counts for each year.
+ (sxql:delete-from :years)
+ (sxql:insert-into :years (:year :total-words)
+ (sxql:select (:year (:as (:sum :count) :total-words))
+ (sxql:from :words)
+ (sxql:group-by :year)))
+ ;; Update normalised word counts -- we can do this now that we have yearly totals.
+ (sxql:update :words
+ (sxql:set= :count-per-100k
+ (:/ :count 1e-5 (sxql:fields (sxql:select :years.total-words
+ (sxql:from :years)
+ (sxql:where (:= :years.year :words.year)))))))))
+
+(defun recreate-database ()
+ "Count words in XML files, overwriting database."
+ (create-word-database)
+ (count-all-files-into-db)
+ (update-global-db-counts)
+ (update-year-range))
+
+
+;;; Querying
+
+(defparameter *whitespace* '(#\space #\tab #\newline #\return #\page)
+ "Whitespace characters to be trimmed from entered terms.")
+
+(defun split-by-character (string separator)
+ "Split STRING into components separated by the single character SEPARATOR."
+ ;; http://cl-cookbook.sourceforge.net/strings.html#reverse
+ (loop for i = 0 then (1+ j)
+ as j = (position separator string :start i)
+ collect (subseq string i j)
+ while j))
+
+(defun nvector-sum (vector1 &rest vectors)
+ "Add VECTORS to VECTOR1 element-wise, altering and returning VECTOR1.
+All VECTORS should have the same `length' as VECTOR1."
+ (dolist (vector vectors vector1)
+ (dotimes (i (min (length vector1) (length vector)))
+ (incf (aref vector1 i) (aref vector i)))))
+
+(defun vector-sum (vector1 &rest vectors)
+ "Add VECTORS to VECTOR1 element-wise, creating and returning a new vector.
+The returned vector will have the same `array-element-type' and `length' as
+VECTOR1. All VECTORS should have the same `length' as VECTOR1."
+ (apply #'nvector-sum
+ (make-array (length vector1) :element-type (array-element-type vector1) :initial-contents vector1)
+ vectors))
+
+(defmacro apply-partially (function &rest initial-args)
+ "Call FUNCTION with INITIAL-ARGS and args passed when the result of this macro is called.
+((apply-partially FUNCTION INITIAL ARGS ...) LAMBDA ARGS ...)
+== (FUNCTION INITIAL ARGS ... LAMBDA ARGS ...)"
+ (let ((other-args (gensym)))
+ `(lambda (&rest ,other-args)
+ (apply ,function ,@initial-args ,other-args))))
+
+(defun parse-terms-entry (query)
+ "Parse QUERY, separating into groups of terms on commas and terms on pluses.
+*WHITESPACE* characters are trimmed from terms. Empty terms and groups are ignored.
+Example: (parse-terms-entry \"a + B, ,c,\") => ((\"a\" \"b\") (\"c\"))"
+ (mapcar
+ (apply-partially #'mapcar #'nstring-downcase)
+ (flet ((split-into-terms (group)
+ (delete-if (lambda (term) (zerop (length term)))
+ (mapcar (apply-partially #'string-trim *whitespace*)
+ (split-by-character group #\+)))))
+ (delete-if #'null (mapcar #'split-into-terms (split-by-character query #\,))))))
+
+(defun get-plot-data (words &key (start-year *min-year*) (end-year *max-year*))
+ "Retrieve normalised word count data for WORDS from *DB*.
+Return a hash-table mapping a word to an array with each word's normalized count
+in year (+ START-YEAR n) at position n."
+ (let ((hash (make-hash-table :test #'equal :size (list-length words)))
+ (stmt (sqlite:prepare-statement
+ *db* (concatenate
+ 'string
+ "SELECT (\"year\" - ?) AS \"year-idx\", \"word\", \"count-per-100k\" FROM \"words\" "
+ "WHERE \"year\" >= ? AND \"year\" <= ? AND \"word\" IN ("
+ (string-right-trim "," (with-output-to-string (stream)
+ (dotimes (i (list-length words))
+ (princ "?," stream))))
+ ")"))))
+
+ ;; Prepopulate the result hash-table with zeros.
+ (let ((number-of-years (1+ (- end-year start-year))))
+ (dolist (word words)
+ (setf (gethash word hash)
+ (make-array number-of-years :element-type 'double-float :initial-element 0d0))))
+
+ ;; Bind prepared statement parameters.
+ (let ((i 0))
+ (dolist (param (list* start-year start-year end-year words))
+ ;; Parameter indices start at 1, for some reason.
+ (sqlite:bind-parameter stmt (incf i) param)))
+
+ (do ((have-another? (sqlite:step-statement stmt) (sqlite:step-statement stmt)))
+ ((not have-another?)
+ ;; Return the completed hash-table.
+ (prog1 hash (sqlite:finalize-statement stmt)))
+ ;; Enter query results into the hash-table.
+ (let ((year-idx (sqlite:statement-column-value stmt 0))
+ (word (sqlite:statement-column-value stmt 1))
+ (count (sqlite:statement-column-value stmt 2)))
+ (unless (null count)
+ (incf (aref (gethash word hash) year-idx) count))))))
+
+
+;;; Plotting
+
+(defparameter *plot-colors* #("red" "green" "blue" "magenta" "cyan" "yellow")
+ "Colors to use to draw plots for each group of words, in order.")
+(defparameter *plot-x-label* "Jahr"
+ "Plot's horizontal axis label.")
+(defparameter *plot-y-label* "Häufigkeit / 100.000"
+ "Plot's vertical axis label.")
+(defparameter *terms-entry* nil
+ "Text entry widget for search terms.")
+(defparameter *plot* nil
+ "Canvas widget to contain plots.")
+
+(defun y-axis-ticks (axis-min axis-max)
+ "Return tick values for Y axis values between AXIS-MIN and AXIS-MAX.
+The axis range can be obtained from the minimum (first) and maximum (last) ticks."
+ (let ((tick-increment (expt 10 (ffloor (log (max (abs axis-max) (abs axis-min)) 10))))
+ ticks)
+ (do ((tick-value (- axis-max (nth-value 1 (fceiling axis-max tick-increment)))
+ (- tick-value tick-increment)))
+ ((< tick-value axis-min)
+ (cons tick-value ticks))
+ (push tick-value ticks))))
+
+(defmacro with-canvas-items (canvas names itemspecs &body forms)
+ "Draw ITEMSPECS on CANVAS, bind the items to NAMES and run FORMS.
+NAMES should be a list of symbols suitable for passing to `destructuring-bind'.
+FORMS may refer to individual NAMES. For documentation on ITEMSPECS, see
+`ltk::create-item-command'."
+ `(destructuring-bind ,names (ltk:make-items ,canvas ,itemspecs)
+ . ,forms))
+
+(defun plot-draw-axes (canvas x-min x-max y-min y-max)
+ "Draw plot axes on CANVAS, returning the bbox where plot lines should be drawn.
+The return value is a list in (LEFT TOP RIGHT BOTTOM) format; all values in
+pixels, measured from the top-left canvas edge."
+ (let ((w (ltk:window-width canvas))
+ (h (ltk:window-height canvas))
+ (outer-pad 10) ; padding around whole plot
+ (inner-pad 2)) ; padding between plot components
+ ;; Draw axis labels first.
+ (with-canvas-items canvas (x-label y-label)
+ `((:text ,(/ w 2) ,(- h outer-pad) ,*plot-x-label* :anchor :s :justify :center)
+ ;; :anchor is relative to the item; :anchor :n :angle 90 == :anchor :w :angle 0
+ (:text ,outer-pad ,(/ h 2) ,*plot-y-label* :anchor :n :justify :center :angle 90))
+ (let* ((above-x-label (- (cadr (ltk:bbox x-label)) inner-pad))
+ (right-of-y-label (+ (caddr (ltk:bbox y-label)) inner-pad))
+ ;; Assume x value labels will be as tall as x axis label (XXX: is this valid?)
+ (text-height (- (cadddr (ltk:bbox x-label)) (cadr (ltk:bbox x-label))))
+ (above-x-axis (- above-x-label inner-pad text-height inner-pad))
+ y-axis-labels)
+ ;; Draw y value labels first, as we know how tall x value labels will be.
+ ;; TODO
+ (dolist (tick-value (y-axis-ticks y-min y-max))
+ (let ((canvas-y ))
+ (push `(:text ,right-of-y-label ,canvas-y (write-to-string tick-value) :anchor :w) y-axis-labels)))
+ ;; Now that we know y value labels' width, draw x value labels in 10-year increments.
+ ;; TODO
+
+ ;; Draw axis arrows last.
+ (with-canvas-items canvas (x-axis y-axis)
+ `((:line ,right-of-y-label ,above-x-label ,(- w outer-pad) ,above-x-label :arrow :last)
+ (:line ,right-of-y-label ,above-x-label ,right-of-y-label ,outer-pad :arrow :last))
+ (let ((above-x-axis (- (cadr (ltk:bbox x-axis)) inner-pad))
+ (right-of-y-axis (+ (caddr (ltk:bbox y-axis)) inner-pad)))
+ ;; Return drawable plot area.
+ (list right-of-y-axis outer-pad (- w outer-pad) above-x-axis)))))))
+
+(defun plot-line-coords (canvas draw-area y-min y-max raw-y-data color)
+ "Draw a line with COLOR inside DRAW-AREA on CANVAS, plotting RAW-Y-DATA between Y-MIN and Y-MAX."
+ (unless (= y-min y-max)
+ (destructuring-bind (draw-left draw-top draw-right draw-bottom) draw-area
+ (let* ((plot-area-w (- draw-right draw-left))
+ (plot-area-h (- draw-bottom draw-top))
+ (number-of-years (- *max-year* *min-year*))
+ (coords (make-array (* 2 (1+ number-of-years))
+ :element-type 'float :fill-pointer 0 :initial-element 0f0)))
+ (dotimes (year-idx (length raw-y-data))
+ (vector-push (+ draw-left (* plot-area-w year-idx (/ (max 1 number-of-years)))) coords)
+ (vector-push (- draw-bottom (* plot-area-h (- (aref raw-y-data year-idx) y-min)
+ (/ (- y-max y-min))))
+ coords))
+ ;; If fewer than two years are plotted, lines don't have enough coords.
+ (do ((extra-year-idx (length raw-y-data) (1+ extra-year-idx)))
+ ((>= (length coords) 4))
+ (vector-push-extend (+ draw-left (* plot-area-w extra-year-idx
+ (/ (max 1 number-of-years))))
+ coords)
+ (vector-push-extend draw-bottom coords))
+ (ltk:configure
+ (ltk:make-line canvas coords)
+ :width 1 :fill color :activefill color :disabledfill color)))))
+
+(defun on-entry-update ()
+ "Update the graph when the search text field is changed."
+ (let* ((word-groups (parse-terms-entry (ltk:text *terms-entry*)))
+ (words-data (get-plot-data (apply #'concatenate 'list word-groups)))
+ (groups-data (mapcar (lambda (group)
+ (apply #'vector-sum
+ (mapcar (lambda (word)
+ (gethash word words-data))
+ group)))
+ word-groups))
+ (plot-min (reduce #'min groups-data :key (apply-partially #'reduce #'min)
+ :initial-value 0))
+ (plot-max (reduce #'max groups-data :key (apply-partially #'reduce #'max)
+ :initial-value 1/1000)))
+ (ltk:clear *plot*)
+ (let ((draw-area (plot-draw-axes *plot* *min-year* *max-year* plot-min plot-max)))
+ (dotimes (i (length groups-data))
+ (plot-line-coords *plot* draw-area plot-min plot-max (pop groups-data)
+ (aref *plot-colors* (mod i (length *plot-colors*))))))))
+
+
+;;; LTk GUI
+
+(defparameter *scan-progress* nil
+ "The progress bar widget showing scan progress.")
+(defparameter *scan-label* nil
+ "A small label showing scan progress.")
+
+(defun gui-notify-start-scan (num-files)
+ "Show scan progress bar with a maximum value of NUM-FILES."
+ (when *scan-progress*
+ (ltk:configure *scan-progress* :mode :determinate :value 0 :maximum num-files)
+ ;; Use remembered grid settings.
+ (ltk:format-wish "grid ~a" (ltk:widget-path *scan-progress*)))
+ (when *scan-label*
+ (setf (ltk:text *scan-label*) "Scan (0):")
+ ;; Use remembered grid settings.
+ (ltk:format-wish "grid ~a" (ltk:widget-path *scan-label*))))
+
+(defun gui-notify-files-scanned (scanned cur-year)
+ "Notify the interface of an updated number of files scanned."
+ (when *scan-progress*
+ (ltk:configure *scan-progress* :value scanned))
+ (when *scan-label*
+ (setf (ltk:text *scan-label*) (with-output-to-string (s)
+ (format s "Scan (~a):" cur-year)))))
+
+(defun gui-notify-end-scan ()
+ "Hide scan progress bar."
+ (when *scan-progress*
+ (ltk:configure *scan-progress* :mode :indeterminate)
+ ;; This remembers grid settings for later use in `gui-notify-start-scan'.
+ (ltk:format-wish "grid remove ~a" (ltk:widget-path *scan-progress*)))
+ (when *scan-label*
+ (setf (ltk:text *scan-label*) "(kein Scan)")
+ ;; This remembers grid settings for later use in `gui-notify-start-scan'.
+ (ltk:format-wish "grid remove ~a" (ltk:widget-path *scan-label*))))
+
+(defmacro grid (&body forms)
+ "Rewrite FORMS into a series of `ltk:grid' calls.
+FORMS is a list of the format (R C WIDGET . EXTRA), rewritten into calls of the
+form (ltk:grid ,WIDGET ,R ,C ,@defaults ,@EXTRA) for some sensible defaults."
+ (let (out)
+ (dolist (form (reverse forms) `(progn . ,out))
+ (destructuring-bind (row column widget &rest extra) form
+ (push `(ltk:grid ,widget ,row ,column :sticky :nsew :padx 5 :pady 5 . ,extra) out)))))
+
+(defmacro make-thread-with-saved (vars &body thread-body)
+ "Call `bt:make-thread', but saving VARS from reassignment inside the thread."
+ (let ((tmps (mapcar (lambda (_) (declare (ignore _)) (gensym)) vars)))
+ `(let ,(mapcar #'list tmps vars)
+ (bt:make-thread (lambda ()
+ (let ,(mapcar #'list vars tmps)
+ . ,thread-body))))))
+
+(defun gui-main ()
+ "Build and show GUI."
+ (ltk:with-ltk (:debug :develop)
+ (let ((top (make-instance 'ltk:frame)))
+ (psetq *terms-entry* (make-instance 'ltk:entry :master top :validate :key)
+ *plot* (make-instance 'ltk:canvas :master top :background "white")
+ *scan-progress* (make-instance 'ltk:progressbar :master top :mode :indeterminate)
+ *scan-label* (make-instance 'ltk:label :master top :text "Scan:"))
+
+ (ltk:pack top :fill :both :expand t)
+ (ltk:grid-columnconfigure top 1 :weight 1)
+ (ltk:grid-rowconfigure top 3 :weight 1)
+
+ ;; The default LTk handler for :validatecommand uses invalid syntax
+ ;; (missing the "return").
+ (ltk:format-wish
+ "~a configure -validatecommand {callback ~a; return 1}"
+ (ltk:widget-path *terms-entry*)
+ (let ((name (ltk::name *terms-entry*)))
+ (ltk::add-callback (ltk::name *terms-entry*) #'on-entry-update)
+ name))
+
+ (let ((toolbar (make-instance 'ltk:frame :master top))
+ bg-thread)
+ (flet ((close-toplevel ()
+ (flet ((exit-main () (setq ltk:*exit-mainloop* t)))
+ (when (and (bt:threadp bg-thread) (bt:thread-alive-p bg-thread))
+ ;; Notify background thread that we're closing.
+ (bt:interrupt-thread bg-thread #'exit-main))
+ (exit-main)))
+ (mkdb-in-thread ()
+ (unless (and (bt:threadp bg-thread) (bt:thread-alive-p bg-thread))
+ (setq bg-thread (make-thread-with-saved (ltk:*wish* ltk:*exit-mainloop*)
+ (recreate-database))))))
+
+ (grid
+ (0 0 toolbar :columnspan 2)
+ (0 0 (make-instance 'ltk:button
+ :master toolbar :text "Schließen"
+ :command #'close-toplevel))
+ (0 1 (make-instance 'ltk:button
+ :master toolbar :text "Daten neu einlesen"
+ :command #'mkdb-in-thread))
+ (2 0 (make-instance 'ltk:label :master top :text "Suche:"))
+ (2 1 *terms-entry*)
+ (3 0 *plot* :columnspan 2)
+ (1 0 *scan-label*)
+ (1 1 *scan-progress*)))))
+
+ ;; Plotting functions need non-`nil' `*min-year*' and `*max-year*'.
+ (when (or (null *min-year*) (null *max-year*))
+ (ltk:configure *terms-entry* :state :disabled))
+
+ ;; Hide scan progress bar until a scan is started.
+ (gui-notify-end-scan)))
+
+(gui-main)