;;; UBF.SCM ;;; Copyright (C) Tony Garnock-Jones 2002, 2003, 2004 ;;; An implementation of Joe Armstrong's UBF (Universal Binary Format) ;;; ;;; Copyright (c) 2000-2004, 2007 Tony Garnock-Jones ;;; ;;; Permission is hereby granted, free of charge, to any person obtaining a copy ;;; of this software and associated documentation files (the "Software"), to deal ;;; in the Software without restriction, including without limitation the rights ;;; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ;;; copies of the Software, and to permit persons to whom the Software is ;;; furnished to do so, subject to the following conditions: ;;; ;;; The above copyright notice and this permission notice shall be included in ;;; all copies or substantial portions of the Software. ;;; ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ;;; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ;;; THE SOFTWARE. ;; ;; Fairly portable Scheme code. Requires R5RS and: ;; ;; - SRFIs 1, 4, 9 and 13. ;; - 1 and 13 will be hard to live without ;; - you could get by without 9 if you don't mind defining structs yourself ;; - 4 is only used for u8vector. Any other binary-capable container will suffice ;; ;; - a "sort" procedure conforming to (sort less? list) ;; ;; - an "error" procedure ;; ;; - a hash-table implementation similar to the one in PLT scheme ;; that compares based on eq?: ;; (make-hash-table) ;; (hash-table-put! ht key value) ;; (hash-table-get ht key default-thunk) ;; (hash-table-for-each ht binary-function) ;; ;; Provides: ;; ;; - (make-tagged-ubf tag value) => tagged-ubf ;; - (tagged-ubf? x) => boolean ;; - (tagged-ubf-tag tagged-ubf) => any ;; - (tagged-ubf-tag-set! tagged-ubf any) ;; - (tagged-ubf-value tagged-ubf) => any ;; - (tagged-ubf-value-set! tagged-ubf any) ;; ;; - ubf-a-reserved-chars ;; ;; - (decode-ubf-a char-provider-thunk) => object ;; - (basic-encode-ubf-a char-emitter-procedure object . maybe-sorted-register-candidates) ;; - (encode-ubf-a char-emitter-procedure object) ;; ;; To encode to/decode from strings, use string ports (SRFI 6). ;; (define-record-type tagged-ubf (make-tagged-ubf tag value) tagged-ubf? (tag tagged-ubf-tag tagged-ubf-tag-set!) (value tagged-ubf-value tagged-ubf-value-set!)) (define ubf-a-reserved-chars (list #\% #\" #\' #\` #\~ #\# #\& #\$ #\> #\- #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\{ #\} #\, #\space #\newline #\return #\tab)) (define decode-ubf-a (let ((digit->number (let ((zero (char->integer #\0))) (lambda (ch) (- (char->integer ch) zero))))) (lambda (char-generator) (let ((stack '()) (dispatch-table (make-vector 256 #f))) (define (bind-char-handler! ch handler) (let ((i (char->integer ch))) (if (not (and (positive? i) (< i 256))) (error "Attempt to bind out-of-range char-handler in decode-ubf-a" ch handler)) (vector-set! dispatch-table i handler))) (define (bind-char-handlers! chars handler) (for-each (lambda (ch) (bind-char-handler! ch handler)) chars)) (define (push! x) (set! stack (cons x stack))) (define (pop!) (let ((x (car stack))) (set! stack (cdr stack)) x)) (define (exch! x) (let ((v (car stack))) (set-car! stack x) v)) (define (peek) (car stack)) (define (empty?) (null? stack)) (define (quoting-collector keep-delimiters stop handler) (lambda (start) (let loop ((acc (if keep-delimiters (list start) '()))) (let ((char (char-generator))) (cond ((eof-object? char) char) ((char=? char #\\) (let ((ch2 (char-generator))) (cond ((eof-object? ch2) ch2) ((or (char=? ch2 #\\) (char=? ch2 stop)) (loop (cons ch2 acc))) (else (error "Unsupported quoted character" ch2 stop))))) ((char=? char stop) (handler (if keep-delimiters (cons char acc) acc))) (else (loop (cons char acc)))))))) (define (next-char) (let ((char (char-generator))) (if (eof-object? char) char (next-char1 char)))) (define (next-char1 char) (let ((i (char->integer char))) (if (not (and (positive? i) (< i 256))) (error "Character out-of-range in next-char1 in decode-ubf-a" char)) (let ((handler (vector-ref dispatch-table i))) (if handler (handler char) (error "Unhandled UBF-A character" char))))) (define (ignore x) (next-char)) (define (handle-string x) (push! (list->string (reverse! x))) (next-char)) (define (handle-symbol x) (push! (string->symbol (list->string (reverse! x)))) (next-char)) (define (handle-semantic-tag xlist) (let ((x (string->symbol (list->string (reverse! xlist))))) (if (empty?) (error "Semantic tag must follow item" x) (let ((value (peek))) (exch! (make-tagged-ubf x value)) (next-char))))) (define (handle-binary first-tilde) (if (or (empty?) (not (number? (peek)))) (error "Binary data must be preceded by length")) (let* ((binlen (peek)) (bv (make-u8vector binlen))) (let loop ((i 0) (ch (char-generator))) (cond ((eof-object? ch) ch) ((= i binlen) (if (char=? ch #\~) (begin (exch! bv) (next-char)) (error "Binary data must be followed by tilde"))) (else (u8vector-set! bv i (char->integer ch)) (loop (+ i 1) (char-generator))))))) (define (handle-null ch) (push! '()) (next-char)) (define (handle-cons ch) (let* ((a (pop!)) (d (peek))) (exch! (cons a d)) (next-char))) (define (handle-eom ch) (if (empty?) (error "Empty stack at end of message") (let ((value (pop!))) (if (empty?) value (error "Rubbish remains on stack at UBF EOM token" stack))))) (define (handle-bind dummy) (let ((ch (char-generator))) (cond ((eof-object? ch) ch) ((memv ch ubf-a-reserved-chars) (error "Attempt to bind to reserved character" ch)) (else (let ((value (pop!))) (bind-char-handler! ch (lambda (ch) (push! value) (next-char))) (next-char)))))) (define (collect-int ch) (let loop ((sign (if (char=? ch #\-) -1 1)) (acc (if (char=? ch #\-) 0 (digit->number ch)))) (let ((char (char-generator))) (cond ((eof-object? char) char) ((and (char>=? char #\0) (char<=? char #\9)) (loop sign (+ (* acc 10) (digit->number char)))) (else (push! (* sign acc)) (next-char1 char)))))) (define (handle-open-struct ch) (push! #\{) (next-char)) (define (handle-close-struct ch) (let loop ((acc '())) (let ((v (pop!))) (if (eqv? v #\{) (begin (push! (list->vector acc)) (next-char)) (loop (cons v acc)))))) (bind-char-handler! #\% (quoting-collector #f #\% ignore)) (bind-char-handler! #\" (quoting-collector #f #\" handle-string)) (bind-char-handler! #\' (quoting-collector #f #\' handle-symbol)) (bind-char-handler! #\` (quoting-collector #f #\` handle-semantic-tag)) (bind-char-handler! #\~ handle-binary) (bind-char-handler! #\# handle-null) (bind-char-handler! #\& handle-cons) (bind-char-handler! #\$ handle-eom) (bind-char-handler! #\> handle-bind) (bind-char-handlers! (string->list "-0123456789") collect-int) (bind-char-handler! #\{ handle-open-struct) (bind-char-handler! #\} handle-close-struct) (bind-char-handlers! (list #\, #\space #\newline #\return #\tab) ignore) (next-char))))) (define basic-encode-ubf-a (let ((regpref (append (string->list "abcdefghijklmnopqrstuvwxyz") (string->list "ABCDEFGHIJKLMNOPQRSTUVWXYZ") (string->list ".,:;[]\\|+=_()*^@!")))) (set! regpref (append regpref (lset-difference char=? (map integer->char (iota 256)) regpref ubf-a-reserved-chars))) (lambda (char-emitter term . maybe-sorted-register-candidates) (let ((registers (make-hash-table)) (sorted-register-candidates (if (pair? maybe-sorted-register-candidates) (car maybe-sorted-register-candidates) '())) (wrote-integer #f)) (define (quote-string q str) (char-emitter q) (string-for-each (lambda (ch) (if (or (char=? ch #\\) (char=? ch q)) (char-emitter #\\)) (char-emitter ch)) str) (char-emitter q)) (do ((terms sorted-register-candidates (cdr terms)) (chars regpref (cdr chars))) ((or (null? terms) (null? chars))) (hash-table-put! registers (car terms) (cons #f (car chars)))) (let walk ((term term)) (let ((entry (hash-table-get registers term (lambda () #f))) (new-wrote-integer #f)) (if (and entry (car entry)) (char-emitter (cdr entry)) (begin (cond ((and (integer? term) (exact? term)) (if (and wrote-integer (not (negative? term))) (char-emitter #\space)) (string-for-each char-emitter (number->string term)) (set! new-wrote-integer #t)) ((string? term) (quote-string #\" term)) ((symbol? term) (quote-string #\' (symbol->string term))) ((u8vector? term) (walk (u8vector-length term)) (char-emitter #\~) (for-each char-emitter (map integer->char (u8vector->list term))) (char-emitter #\~)) ((vector? term) (char-emitter #\{) (for-each walk (vector->list term)) (char-emitter #\})) ((null? term) (char-emitter #\#)) ((pair? term) (walk (cdr term)) (walk (car term)) (char-emitter #\&)) ((tagged-ubf? term) (walk (tagged-ubf-value term)) (quote-string #\` (symbol->string (tagged-ubf-tag term)))) (else (error "Unsupported term type in basic-encode-ubf-a" term))) (if entry (begin (set-car! entry #t) (char-emitter #\>) (char-emitter (cdr entry)) (char-emitter (cdr entry)))))) (set! wrote-integer new-wrote-integer))) (char-emitter #\$))))) (define (encode-ubf-a char-emitter term) (let ((terms (make-hash-table)) (sorted-terms '())) ;; Need to bias the sorted term list not just to the most ;; frequently-occurring objects, but also to the larger objects: ;; sort by (size * count) instead of just count. (let walk ((term term)) (let ((entry (hash-table-get terms term (lambda () #f)))) (if entry (hash-table-put! terms term (+ entry 1)) (begin (hash-table-put! terms term 1) (cond ((vector? term) (for-each walk (vector->list term))) ((pair? term) (walk (car term)) (walk (cdr term))) ((tagged-ubf? term) (walk (tagged-ubf-value term))) (else 'its-a-non-compound-datum)))))) (hash-table-for-each terms (lambda (term count) ;; Don't bother caching any terms that only occur once, or that ;; have encoded representations a single byte long already. (if (and (> count 1) (not (or (null? term) (and (integer? term) (<= 0 term 9))))) (set! sorted-terms (cons (cons count term) sorted-terms))))) ;; Sort the list, putting the most-commonly-occurring term first. (set! sorted-terms (sort (lambda (a b) (> (car a) (car b))) sorted-terms)) (basic-encode-ubf-a char-emitter term (map cdr sorted-terms))))