diff options
| -rw-r--r-- | .gitignore | 4 | ||||
| -rw-r--r-- | bwoerter.lisp | 486 |
2 files changed, 490 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f4593fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# Exclude the actual data since it's huge and can be downloaded separately. +/words.sqlite +/data/ +/xml/ diff --git a/bwoerter.lisp b/bwoerter.lisp new file mode 100644 index 0000000..c4d7520 --- /dev/null +++ b/bwoerter.lisp @@ -0,0 +1,486 @@ +;;; bwoerter.lisp +;;; Read debate text from XML and make graphs of word frequency over time. + +;;; Author: Timo Wilken +;;; Licence: GPL3 + +;;; Using data from https://www.bundestag.de/services/opendata +;;; This tool expects XML files from the above website to be stored in ./xml/, +;;; next to their DTD. + +;;; TODO: Draw axis scales on plots. +;;; TODO: Handle n-grams. +;;; - read them into the DB as multi-word sequences? +;;; - store locations of every word and reassemble n-grams query-by-query? + +;;; Required packages +(ql:quickload '(:fxml :fxml/xpath :sqlite :sxql :ltk :bt-semaphore)) + + +;;; Words database + +;; SQLite syntax. +(setq sxql:*quote-character* #\") + +(defparameter *db* (sqlite:connect "words.sqlite") + "Database connection.") +(defparameter *min-year* nil + "Earliest year with any data.") +(defparameter *max-year* nil + "Latest year with any data.") + +(defun update-year-range () + "Update *MIN-YEAR* and *MAX-YEAR* to match year range stored in *DB*." + (multiple-value-bind (min max) + (sqlite:execute-one-row-m-v + *db* (sxql:yield (sxql:select ((:min :year) (:max :year)) (sxql:from :words)))) + (psetq *min-year* min + *max-year* max))) + +(handler-case (update-year-range) + ;; If the database doesn't exist, this will throw an error. Ignore it for now. + (sqlite:sqlite-error () nil)) + +(defun skip-non-word-chars (input-stream) + "Read from INPUT-STREAM up to, but not including, the next word character. +Always returns NIL. No effect at end of file." + (handler-case + (do ((c (read-char input-stream) (read-char input-stream))) + ((alpha-char-p c) (unread-char c input-stream))) + (end-of-file () nil))) + +(defun read-word (input-stream) + "Read and return a run of word characters from INPUT-STREAM. +Swallows any leading non-word characters and one trailing non-word character +from INPUT-STREAM. If INPUT-STREAM is at end-of-stream, return NIL." + (let ((word (make-array 0 :element-type 'character :adjustable t))) + (do ((c (read-char input-stream nil nil) (read-char input-stream nil nil))) + (nil) ; loop until (return) + (cond + ;; got a word char: accumulate into word + ((and (characterp c) (alpha-char-p c)) + (vector-push-extend c word)) + ;; got a non-word char or EOF but we have an accumulated word: return it + ((> (array-total-size word) 0) + (return word)) + ;; got EOF but no word accumulated: return nil + ((not (characterp c)) + (return nil)) + ;; got a non-word char but haven't seen any word chars yet: carry on + (t nil))))) + +(defun parse-xml (xml-file-pathname) + "Parse the debate's year and full text from the file at XML-FILE-PATHNAME." + (let ((doc (fxml:parse xml-file-pathname (fxml-dom:make-dom-builder) + :root "DOKUMENT" :forbid-external nil))) + (values (parse-integer (xpath:string-value (xpath:evaluate "//DATUM" doc)) + :start 6 :end 10) + (xpath:string-value (xpath:evaluate "//TEXT" doc))))) + +(defun count-words (text &optional (normalize (lambda (word) word))) + "Return a hash-table of strings (words) to their counts in TEXT." + (let ((stream (make-string-input-stream text)) + (counts (make-hash-table :test #'equal :size 1000))) + (do ((word (read-word stream) (read-word stream))) + ((null word) counts) + (incf (gethash (funcall normalize word) counts 0))))) + +(defmacro db-operation (operation-func &body sxql-statements) + "Convert SXQL-STATEMENTS to SQL and pass that OPERATION-FUNC." + (let ((cur-stmt (gensym)) + (cur-args (gensym)) + (sxql-stmt (gensym))) + `(sqlite:with-transaction *db* + (dolist (,sxql-stmt (list . ,sxql-statements)) + (multiple-value-bind (,cur-stmt ,cur-args) (sxql:yield ,sxql-stmt) + (apply #',operation-func *db* ,cur-stmt ,cur-args)))))) + +(defun create-word-database () + "Create a blank database for word counts in *DB*." + (db-operation sqlite:execute-non-query + (sxql:drop-table :years :if-exists t) + (sxql:drop-table :words :if-exists t) + (sxql:create-table :years + ((year :type 'integer :not-null t :primary-key t) + (total-words :type 'integer))) + (sxql:create-table :words + ((year :type 'integer :not-null t) + (word :type 'string :not-null t) + (count :type 'integer :not-null t) + (count-per-100k :type 'float)) + (sxql:primary-key '(:year :word))))) + +(defun make-insert-query (year word-counts) + "Create a SQLite query to insert WORD-COUNTS into *DB*, all in YEAR. +Return two values: the query string and a list of parameters." + (let* (query-args + (query-start + (string-right-trim + "," (with-output-to-string (stream) + (princ "INSERT INTO \"words\" (\"year\", \"word\", \"count\") VALUES" stream) + (flet ((extend-query (word count) + (setq query-args (list* year word count query-args)) + (princ "(?,?,?)," stream))) + (maphash #'extend-query word-counts))))) + (query-end " ON CONFLICT (\"year\", \"word\") DO UPDATE SET \"count\" = \"count\" + \"excluded\".\"count\"")) + (values (concatenate 'string query-start query-end) query-args))) + +(defun count-all-files-into-db () + "Loop through XML files, counting words and writing them to *DB*." + (block count + (let ((xmlfiles (directory (make-pathname :name :wild :type "xml" :defaults + (truename (make-pathname :directory '(:relative "xml")))))) + (num-scanned 0)) + (gui-notify-start-scan (length xmlfiles)) + (dolist (xmlfn xmlfiles) + (multiple-value-bind (year text) (parse-xml xmlfn) + ;; Don't use sqxl here because doing everything in a single query is + ;; much more efficient here (143.1 sec/file --> 0.2 sec/file)! + (multiple-value-bind (query args) + ;; NOTE: [n]string-*case don't change ß and ẞ, but handle äöü fine. + (make-insert-query year (count-words text #'nstring-downcase)) + (apply #'sqlite:execute-non-query *db* query args)) + + ;; We're in another thread, the main window might be closed out from under us. + (when ltk:*exit-mainloop* + (return-from count)) + (gui-notify-files-scanned (incf num-scanned) year)) + (bt:thread-yield))) + (gui-notify-end-scan))) + +(defun update-global-db-counts () + "Update yearly total word counts and normalised word counts in *DB*." + (db-operation sqlite:execute-non-query + ;; Recalculate total word counts for each year. + (sxql:delete-from :years) + (sxql:insert-into :years (:year :total-words) + (sxql:select (:year (:as (:sum :count) :total-words)) + (sxql:from :words) + (sxql:group-by :year))) + ;; Update normalised word counts -- we can do this now that we have yearly totals. + (sxql:update :words + (sxql:set= :count-per-100k + (:/ :count 1e-5 (sxql:fields (sxql:select :years.total-words + (sxql:from :years) + (sxql:where (:= :years.year :words.year))))))))) + +(defun recreate-database () + "Count words in XML files, overwriting database." + (create-word-database) + (count-all-files-into-db) + (update-global-db-counts) + (update-year-range)) + + +;;; Querying + +(defparameter *whitespace* '(#\space #\tab #\newline #\return #\page) + "Whitespace characters to be trimmed from entered terms.") + +(defun split-by-character (string separator) + "Split STRING into components separated by the single character SEPARATOR." + ;; http://cl-cookbook.sourceforge.net/strings.html#reverse + (loop for i = 0 then (1+ j) + as j = (position separator string :start i) + collect (subseq string i j) + while j)) + +(defun nvector-sum (vector1 &rest vectors) + "Add VECTORS to VECTOR1 element-wise, altering and returning VECTOR1. +All VECTORS should have the same `length' as VECTOR1." + (dolist (vector vectors vector1) + (dotimes (i (min (length vector1) (length vector))) + (incf (aref vector1 i) (aref vector i))))) + +(defun vector-sum (vector1 &rest vectors) + "Add VECTORS to VECTOR1 element-wise, creating and returning a new vector. +The returned vector will have the same `array-element-type' and `length' as +VECTOR1. All VECTORS should have the same `length' as VECTOR1." + (apply #'nvector-sum + (make-array (length vector1) :element-type (array-element-type vector1) :initial-contents vector1) + vectors)) + +(defmacro apply-partially (function &rest initial-args) + "Call FUNCTION with INITIAL-ARGS and args passed when the result of this macro is called. +((apply-partially FUNCTION INITIAL ARGS ...) LAMBDA ARGS ...) +== (FUNCTION INITIAL ARGS ... LAMBDA ARGS ...)" + (let ((other-args (gensym))) + `(lambda (&rest ,other-args) + (apply ,function ,@initial-args ,other-args)))) + +(defun parse-terms-entry (query) + "Parse QUERY, separating into groups of terms on commas and terms on pluses. +*WHITESPACE* characters are trimmed from terms. Empty terms and groups are ignored. +Example: (parse-terms-entry \"a + B, ,c,\") => ((\"a\" \"b\") (\"c\"))" + (mapcar + (apply-partially #'mapcar #'nstring-downcase) + (flet ((split-into-terms (group) + (delete-if (lambda (term) (zerop (length term))) + (mapcar (apply-partially #'string-trim *whitespace*) + (split-by-character group #\+))))) + (delete-if #'null (mapcar #'split-into-terms (split-by-character query #\,)))))) + +(defun get-plot-data (words &key (start-year *min-year*) (end-year *max-year*)) + "Retrieve normalised word count data for WORDS from *DB*. +Return a hash-table mapping a word to an array with each word's normalized count +in year (+ START-YEAR n) at position n." + (let ((hash (make-hash-table :test #'equal :size (list-length words))) + (stmt (sqlite:prepare-statement + *db* (concatenate + 'string + "SELECT (\"year\" - ?) AS \"year-idx\", \"word\", \"count-per-100k\" FROM \"words\" " + "WHERE \"year\" >= ? AND \"year\" <= ? AND \"word\" IN (" + (string-right-trim "," (with-output-to-string (stream) + (dotimes (i (list-length words)) + (princ "?," stream)))) + ")")))) + + ;; Prepopulate the result hash-table with zeros. + (let ((number-of-years (1+ (- end-year start-year)))) + (dolist (word words) + (setf (gethash word hash) + (make-array number-of-years :element-type 'double-float :initial-element 0d0)))) + + ;; Bind prepared statement parameters. + (let ((i 0)) + (dolist (param (list* start-year start-year end-year words)) + ;; Parameter indices start at 1, for some reason. + (sqlite:bind-parameter stmt (incf i) param))) + + (do ((have-another? (sqlite:step-statement stmt) (sqlite:step-statement stmt))) + ((not have-another?) + ;; Return the completed hash-table. + (prog1 hash (sqlite:finalize-statement stmt))) + ;; Enter query results into the hash-table. + (let ((year-idx (sqlite:statement-column-value stmt 0)) + (word (sqlite:statement-column-value stmt 1)) + (count (sqlite:statement-column-value stmt 2))) + (unless (null count) + (incf (aref (gethash word hash) year-idx) count)))))) + + +;;; Plotting + +(defparameter *plot-colors* #("red" "green" "blue" "magenta" "cyan" "yellow") + "Colors to use to draw plots for each group of words, in order.") +(defparameter *plot-x-label* "Jahr" + "Plot's horizontal axis label.") +(defparameter *plot-y-label* "Häufigkeit / 100.000" + "Plot's vertical axis label.") +(defparameter *terms-entry* nil + "Text entry widget for search terms.") +(defparameter *plot* nil + "Canvas widget to contain plots.") + +(defun y-axis-ticks (axis-min axis-max) + "Return tick values for Y axis values between AXIS-MIN and AXIS-MAX. +The axis range can be obtained from the minimum (first) and maximum (last) ticks." + (let ((tick-increment (expt 10 (ffloor (log (max (abs axis-max) (abs axis-min)) 10)))) + ticks) + (do ((tick-value (- axis-max (nth-value 1 (fceiling axis-max tick-increment))) + (- tick-value tick-increment))) + ((< tick-value axis-min) + (cons tick-value ticks)) + (push tick-value ticks)))) + +(defmacro with-canvas-items (canvas names itemspecs &body forms) + "Draw ITEMSPECS on CANVAS, bind the items to NAMES and run FORMS. +NAMES should be a list of symbols suitable for passing to `destructuring-bind'. +FORMS may refer to individual NAMES. For documentation on ITEMSPECS, see +`ltk::create-item-command'." + `(destructuring-bind ,names (ltk:make-items ,canvas ,itemspecs) + . ,forms)) + +(defun plot-draw-axes (canvas x-min x-max y-min y-max) + "Draw plot axes on CANVAS, returning the bbox where plot lines should be drawn. +The return value is a list in (LEFT TOP RIGHT BOTTOM) format; all values in +pixels, measured from the top-left canvas edge." + (let ((w (ltk:window-width canvas)) + (h (ltk:window-height canvas)) + (outer-pad 10) ; padding around whole plot + (inner-pad 2)) ; padding between plot components + ;; Draw axis labels first. + (with-canvas-items canvas (x-label y-label) + `((:text ,(/ w 2) ,(- h outer-pad) ,*plot-x-label* :anchor :s :justify :center) + ;; :anchor is relative to the item; :anchor :n :angle 90 == :anchor :w :angle 0 + (:text ,outer-pad ,(/ h 2) ,*plot-y-label* :anchor :n :justify :center :angle 90)) + (let* ((above-x-label (- (cadr (ltk:bbox x-label)) inner-pad)) + (right-of-y-label (+ (caddr (ltk:bbox y-label)) inner-pad)) + ;; Assume x value labels will be as tall as x axis label (XXX: is this valid?) + (text-height (- (cadddr (ltk:bbox x-label)) (cadr (ltk:bbox x-label)))) + (above-x-axis (- above-x-label inner-pad text-height inner-pad)) + y-axis-labels) + ;; Draw y value labels first, as we know how tall x value labels will be. + ;; TODO + (dolist (tick-value (y-axis-ticks y-min y-max)) + (let ((canvas-y )) + (push `(:text ,right-of-y-label ,canvas-y (write-to-string tick-value) :anchor :w) y-axis-labels))) + ;; Now that we know y value labels' width, draw x value labels in 10-year increments. + ;; TODO + + ;; Draw axis arrows last. + (with-canvas-items canvas (x-axis y-axis) + `((:line ,right-of-y-label ,above-x-label ,(- w outer-pad) ,above-x-label :arrow :last) + (:line ,right-of-y-label ,above-x-label ,right-of-y-label ,outer-pad :arrow :last)) + (let ((above-x-axis (- (cadr (ltk:bbox x-axis)) inner-pad)) + (right-of-y-axis (+ (caddr (ltk:bbox y-axis)) inner-pad))) + ;; Return drawable plot area. + (list right-of-y-axis outer-pad (- w outer-pad) above-x-axis))))))) + +(defun plot-line-coords (canvas draw-area y-min y-max raw-y-data color) + "Draw a line with COLOR inside DRAW-AREA on CANVAS, plotting RAW-Y-DATA between Y-MIN and Y-MAX." + (unless (= y-min y-max) + (destructuring-bind (draw-left draw-top draw-right draw-bottom) draw-area + (let* ((plot-area-w (- draw-right draw-left)) + (plot-area-h (- draw-bottom draw-top)) + (number-of-years (- *max-year* *min-year*)) + (coords (make-array (* 2 (1+ number-of-years)) + :element-type 'float :fill-pointer 0 :initial-element 0f0))) + (dotimes (year-idx (length raw-y-data)) + (vector-push (+ draw-left (* plot-area-w year-idx (/ (max 1 number-of-years)))) coords) + (vector-push (- draw-bottom (* plot-area-h (- (aref raw-y-data year-idx) y-min) + (/ (- y-max y-min)))) + coords)) + ;; If fewer than two years are plotted, lines don't have enough coords. + (do ((extra-year-idx (length raw-y-data) (1+ extra-year-idx))) + ((>= (length coords) 4)) + (vector-push-extend (+ draw-left (* plot-area-w extra-year-idx + (/ (max 1 number-of-years)))) + coords) + (vector-push-extend draw-bottom coords)) + (ltk:configure + (ltk:make-line canvas coords) + :width 1 :fill color :activefill color :disabledfill color))))) + +(defun on-entry-update () + "Update the graph when the search text field is changed." + (let* ((word-groups (parse-terms-entry (ltk:text *terms-entry*))) + (words-data (get-plot-data (apply #'concatenate 'list word-groups))) + (groups-data (mapcar (lambda (group) + (apply #'vector-sum + (mapcar (lambda (word) + (gethash word words-data)) + group))) + word-groups)) + (plot-min (reduce #'min groups-data :key (apply-partially #'reduce #'min) + :initial-value 0)) + (plot-max (reduce #'max groups-data :key (apply-partially #'reduce #'max) + :initial-value 1/1000))) + (ltk:clear *plot*) + (let ((draw-area (plot-draw-axes *plot* *min-year* *max-year* plot-min plot-max))) + (dotimes (i (length groups-data)) + (plot-line-coords *plot* draw-area plot-min plot-max (pop groups-data) + (aref *plot-colors* (mod i (length *plot-colors*)))))))) + + +;;; LTk GUI + +(defparameter *scan-progress* nil + "The progress bar widget showing scan progress.") +(defparameter *scan-label* nil + "A small label showing scan progress.") + +(defun gui-notify-start-scan (num-files) + "Show scan progress bar with a maximum value of NUM-FILES." + (when *scan-progress* + (ltk:configure *scan-progress* :mode :determinate :value 0 :maximum num-files) + ;; Use remembered grid settings. + (ltk:format-wish "grid ~a" (ltk:widget-path *scan-progress*))) + (when *scan-label* + (setf (ltk:text *scan-label*) "Scan (0):") + ;; Use remembered grid settings. + (ltk:format-wish "grid ~a" (ltk:widget-path *scan-label*)))) + +(defun gui-notify-files-scanned (scanned cur-year) + "Notify the interface of an updated number of files scanned." + (when *scan-progress* + (ltk:configure *scan-progress* :value scanned)) + (when *scan-label* + (setf (ltk:text *scan-label*) (with-output-to-string (s) + (format s "Scan (~a):" cur-year))))) + +(defun gui-notify-end-scan () + "Hide scan progress bar." + (when *scan-progress* + (ltk:configure *scan-progress* :mode :indeterminate) + ;; This remembers grid settings for later use in `gui-notify-start-scan'. + (ltk:format-wish "grid remove ~a" (ltk:widget-path *scan-progress*))) + (when *scan-label* + (setf (ltk:text *scan-label*) "(kein Scan)") + ;; This remembers grid settings for later use in `gui-notify-start-scan'. + (ltk:format-wish "grid remove ~a" (ltk:widget-path *scan-label*)))) + +(defmacro grid (&body forms) + "Rewrite FORMS into a series of `ltk:grid' calls. +FORMS is a list of the format (R C WIDGET . EXTRA), rewritten into calls of the +form (ltk:grid ,WIDGET ,R ,C ,@defaults ,@EXTRA) for some sensible defaults." + (let (out) + (dolist (form (reverse forms) `(progn . ,out)) + (destructuring-bind (row column widget &rest extra) form + (push `(ltk:grid ,widget ,row ,column :sticky :nsew :padx 5 :pady 5 . ,extra) out))))) + +(defmacro make-thread-with-saved (vars &body thread-body) + "Call `bt:make-thread', but saving VARS from reassignment inside the thread." + (let ((tmps (mapcar (lambda (_) (declare (ignore _)) (gensym)) vars))) + `(let ,(mapcar #'list tmps vars) + (bt:make-thread (lambda () + (let ,(mapcar #'list vars tmps) + . ,thread-body)))))) + +(defun gui-main () + "Build and show GUI." + (ltk:with-ltk (:debug :develop) + (let ((top (make-instance 'ltk:frame))) + (psetq *terms-entry* (make-instance 'ltk:entry :master top :validate :key) + *plot* (make-instance 'ltk:canvas :master top :background "white") + *scan-progress* (make-instance 'ltk:progressbar :master top :mode :indeterminate) + *scan-label* (make-instance 'ltk:label :master top :text "Scan:")) + + (ltk:pack top :fill :both :expand t) + (ltk:grid-columnconfigure top 1 :weight 1) + (ltk:grid-rowconfigure top 3 :weight 1) + + ;; The default LTk handler for :validatecommand uses invalid syntax + ;; (missing the "return"). + (ltk:format-wish + "~a configure -validatecommand {callback ~a; return 1}" + (ltk:widget-path *terms-entry*) + (let ((name (ltk::name *terms-entry*))) + (ltk::add-callback (ltk::name *terms-entry*) #'on-entry-update) + name)) + + (let ((toolbar (make-instance 'ltk:frame :master top)) + bg-thread) + (flet ((close-toplevel () + (flet ((exit-main () (setq ltk:*exit-mainloop* t))) + (when (and (bt:threadp bg-thread) (bt:thread-alive-p bg-thread)) + ;; Notify background thread that we're closing. + (bt:interrupt-thread bg-thread #'exit-main)) + (exit-main))) + (mkdb-in-thread () + (unless (and (bt:threadp bg-thread) (bt:thread-alive-p bg-thread)) + (setq bg-thread (make-thread-with-saved (ltk:*wish* ltk:*exit-mainloop*) + (recreate-database)))))) + + (grid + (0 0 toolbar :columnspan 2) + (0 0 (make-instance 'ltk:button + :master toolbar :text "Schließen" + :command #'close-toplevel)) + (0 1 (make-instance 'ltk:button + :master toolbar :text "Daten neu einlesen" + :command #'mkdb-in-thread)) + (2 0 (make-instance 'ltk:label :master top :text "Suche:")) + (2 1 *terms-entry*) + (3 0 *plot* :columnspan 2) + (1 0 *scan-label*) + (1 1 *scan-progress*))))) + + ;; Plotting functions need non-`nil' `*min-year*' and `*max-year*'. + (when (or (null *min-year*) (null *max-year*)) + (ltk:configure *terms-entry* :state :disabled)) + + ;; Hide scan progress bar until a scan is started. + (gui-notify-end-scan))) + +(gui-main) |
