;;; Char->char partial maps -*- Scheme -*- ;;; Copyright (C) 1998 by Olin Shivers. ;;; CCPs are an efficient data structure for doing simple string transforms, ;;; similar to the kinds of things you would do with the tr(1) program. ;;; ;;; This code is tuned for a 7- or 8-bit character type. Large, 16-bit ;;; character types would need a more sophisticated data structure, tuned ;;; for sparseness. I would suggest something like this: ;;; (define-record ccp ;;; domain ; The domain char-set ;;; map ; Sorted vector of (char . string) pairs ;;; ; specifying the map. ;;; id?) ; If true, mappings not specified by MAP are ;;; ; identity mapping. If false, MAP must ;;; ; specify a mapping for every char in DOMAIN. ;;; ;;; A (char . string) elements in MAP specifies a mapping for the contiguous ;;; sequence of L chars beginning with CHAR (in the sequence of the underlying ;;; char type representation), where L is the length of STRING. These MAP elements ;;; are sorted by CHAR, so that binary search can be used to get from an input ;;; character C to the right MAP element quickly. ;;; ;;; This representation should be reasonably compact for standard mappings on, ;;; say, a Unicode CCP. An implementation might wish to have a cache field ;;; in the record for storing the full 8kb bitset when performing ccp-map ;;; operations. Or, an implementation might want to store the Latin-1 subset ;;; of the map in a dense format, and keep the remainder in a sparse format. (define num-chars (char-set-size char-set:full)) ; AKA 256. (define-record ccp domain ; The domain char-set dshared? ; Is the domain value shared or linear? map ; 256-elt string mshared?) ; Is the map string shared or linear? ;;; Accessors and setters that manage the linear bookkeeping ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (ccp-domain ccp) (set-ccp:dshared? ccp #t) (ccp:domain ccp)) ;;; CCP is a linear ccp. PROC is a domain->domain function; it must be ;;; linear in its parameter and result. ;;; ;;; Updates the domain of the CCP with PROC, returns the resulting ;;; CCP; reuses the old one to construct the new one. (define (restrict-linear-ccp-domain ccp proc) (let ((new-d (proc (if (ccp:dshared? ccp) (begin (set-ccp:dshared? ccp #f) (char-set-copy (ccp:domain ccp))) (ccp:domain ccp))))) (set-ccp:domain ccp new-d) ccp)) ;;; CCP is a linear CCP. PROC is a domain x cmap -> domain function. ;;; It is passed a linear domain and cmap string. It may side-effect ;;; the cmap string, and returns the resulting updated domain. ;;; We return the resulting CCP, reusing the parameter to construct it. (define (linear-update-ccp ccp proc) (let* ((cmap (if (ccp:mshared? ccp) (begin (set-ccp:mshared? ccp #f) (string-copy (ccp:map ccp))) (ccp:map ccp))) (new-d (proc (if (ccp:dshared? ccp) (begin (set-ccp:dshared? ccp #f) (char-set-copy (ccp:domain ccp))) (ccp:domain ccp)) cmap))) (set-ccp:domain ccp new-d) ccp)) ;;; Return CCP's map field, and mark it as shared. CCP functions that ;;; restrict a ccp's domain share map strings, so they use this guy. (define (ccp:map/shared ccp) (set-ccp:mshared? ccp #t) (ccp:map ccp)) (define (ccp-copy ccp) (make-ccp (char-set-copy (ccp:domain ccp)) #f (string-copy (ccp:map ccp)) #f)) ;;; N-ary equality relation for partial maps (define (ccp= ccp1 . rest) (let ((domain (ccp:domain ccp1)) (cmap (ccp:map ccp1))) (every (lambda (ccp2) (and (char-set= domain (ccp:domain ccp2)) (let ((cmap2 (ccp:map ccp2))) (char-set-every? (lambda (c) (let ((i (char->ascii c))) (char=? (string-ref cmap i) (string-ref cmap2 i)))) domain)))) rest))) ;;; N-ary subset relation for partial maps (define (ccp<= ccp1 . rest) (let lp ((domain1 (ccp:domain ccp1)) (cmap1 (ccp:map ccp1)) (rest rest)) (or (not (pair? rest)) (let* ((ccp2 (car rest)) (domain2 (ccp:domain ccp2)) (cmap2 (ccp:map ccp2)) (rest (cdr rest))) (and (char-set<= domain1 domain2) (let ((cmap2 (ccp:map ccp2))) (char-set-every? (lambda (c) (let ((i (char->ascii c))) (char=? (string-ref cmap1 i) (string-ref cmap2 i)))) domain1)) (lp domain2 cmap2 rest)))))) ;;; CCP iterators ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (ccp-fold kons knil ccp) (let ((cmap (ccp:map ccp))) (char-set-fold (lambda (c v) (kons c (string-ref cmap (char->ascii c)) v)) knil (ccp:domain ccp)))) (define (ccp-for-each proc ccp) (let ((cmap (ccp:map ccp))) (char-set-for-each (lambda (c) (proc c (string-ref cmap (char->ascii c)))) (ccp:domain ccp)))) (define (ccp->alist ccp) (ccp-fold (lambda (from to alist) (cons (cons from to) alist)) '() ccp)) ;;; CCP-RESTRICT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Restrict a ccp's domain. (define (ccp-restrict ccp cset) (make-ccp (char-set-intersection cset (ccp:domain ccp)) #f (ccp:map/shared ccp) #t)) (define (ccp-restrict! ccp cset) (restrict-linear-ccp-domain ccp (lambda (d) (char-set-intersection! d cset)))) ;;; CCP-ADJOIN ccp from-char1 to-char1 ... ;;; CCP-DELETE ccp char1 ... ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Add & delete mappings to/from a ccp. (define (ccp-delete ccp . chars) (make-ccp (apply char-set-delete (ccp:domain ccp) chars) #f (ccp:map/shared ccp) #t)) (define (ccp-delete! ccp . chars) (restrict-linear-ccp-domain ccp (lambda (d) (apply char-set-delete! d chars)))) (define (ccp-adjoin ccp . chars) (let ((cmap (string-copy (ccp:map ccp)))) (make-ccp (install-ccp-adjoin! cmap (char-set-copy (ccp:domain ccp)) chars) #f cmap #f))) (define (ccp-adjoin! ccp . chars) (linear-update-ccp ccp (lambda (d cmap) (install-ccp-adjoin! cmap d chars)))) (define (install-ccp-adjoin! cmap domain chars) (let lp ((chars chars) (d domain)) (if (pair? chars) (let ((from (car chars)) (to (cadr chars)) (chars (cddr chars))) (string-set! cmap (char->ascii from) to) (lp chars (char-set-adjoin! d from))) d))) ;;; CCP-EXTEND ccp1 ... ;;; CCP-EXTEND! ccp1 ccp2 ... ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Extend ccp1 with ccp2, etc. (define (ccp-extend . ccps) (if (pair? ccps) (let ((ccp0 (car ccps)) (ccps (cdr ccps))) (if (pair? ccps) (let ((cmap (string-copy (ccp:map ccp0)))) ; Copy cmap. ;; The FOLD installs each ccp in CCPS into CMAP and produces ;; the new domain. (make-ccp (fold (lambda (ccp d) (install-ccp-extension! cmap d ccp)) (char-set-copy (ccp:domain ccp0)) ccps) #f cmap #f)) ccp0)) ; Only 1 parameter ccp:0)) ; 0 parameters (define (ccp-extend! ccp0 . ccps) (linear-update-ccp ccp0 (lambda (domain cmap) (fold (lambda (ccp d) (install-ccp-extension! cmap d ccp)) domain ccps)))) ;;; Side-effect CMAP, linear-update and return DOMAIN. (define (install-ccp-extension! cmap domain ccp) (let ((cmap1 (ccp:map ccp)) (domain1 (ccp:domain ccp))) (char-set-for-each (lambda (c) (let ((i (char->ascii c))) (string-set! cmap i (string-ref cmap1 i)))) domain1) (char-set-union! domain domain1))) ;;; Compose the CCPs. 0-ary case: (ccp-compose) = ccp:1. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; For each character C-IN in the original domain, we push it ;;; through the pipeline of CCPs. If we ever land outside the ;;; domain of a ccp, we punt C-IN. If we push it all the way ;;; through, we add C-IN to our result domain, and add the mapping ;;; into the cmap we are assembling. ;;; ;;; Looping this way avoids building up intermediate temporary ;;; CCPs. If CCP's were small bitsets, we might be better off ;;; slicing the double-nested loops the other way around. (define (ccp-compose . ccps) (cond ((not (pair? ccps)) ccp:1) ; 0 args => ccp:1 ((not (pair? (cdr ccps))) (car ccps)) ; 1 arg (else (let* ((v (list->vector ccps)) (vlen-2 (- (vector-length v) 2)) (cmap (make-string num-chars)) (d1 (ccp:domain (vector-ref v (+ vlen-2 1)))) (d (char-set-fold (lambda (c-in d) (let lp ((c c-in) (i vlen-2)) (if (>= i 0) (let ((ccp (vector-ref v i))) (if (char-set-contains? (ccp:domain ccp) c) (lp (string-ref (ccp:map ccp) (char->ascii c)) (- i 1)) ;; Lose: remove c-in from d. (char-set-delete! d c-in))) ;; Win: C-IN -> C (begin (string-set! cmap (char->ascii c-in) c) d)))) (char-set-copy d1) d1))) (make-ccp d #f cmap #f))))) ;;; ALIST->CPP ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (alist->ccp cc-alist . maybe-base-ccp) (let ((base (:optional maybe-base-ccp ccp:0))) (if (pair? cc-alist) (let ((cmap (string-copy (ccp:map base)))) (make-ccp (install-ccp-alist! cmap (char-set-copy (ccp:domain base)) cc-alist) #f cmap #f)) base))) (define (alist->ccp! alist base) (linear-update-ccp base (lambda (d cmap) (install-ccp-alist! cmap d alist)))) ;;; Side-effect CMAP, linear-update and return DOMAIN. (define (install-ccp-alist! cmap domain alist) (fold (lambda (from/to d) (let ((from (car from/to)) (to (cdr from/to))) (string-set! cmap (char->ascii from) to) (char-set-adjoin! domain from))) domain alist)) ;;; PROC->CCP ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (proc->ccp proc [domain base-ccp]) (define (proc->ccp proc . args) (let-optionals args ((proc-domain char-set:full) (base ccp:0)) (let ((cmap (string-copy (ccp:map base)))) (make-ccp (install-ccp-proc! cmap (char-set-copy (ccp:domain base)) proc proc-domain) #f cmap #f)))) (define (proc->ccp! proc proc-domain base) (linear-update-ccp base (lambda (d cmap) (install-ccp-proc! cmap d proc proc-domain)))) (define (install-ccp-proc! cmap domain proc proc-domain) (char-set-for-each (lambda (c) (string-set! cmap (char->ascii c) (proc c))) proc-domain) (char-set-union! domain proc-domain)) ;;; CONSTANT-CCP ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (constant-ccp char [domain base-ccp]) ;;; Extend BASE-CCP with the a map taking every char in DOMAIN to CHAR. ;;; DOMAIN defaults to char-set:full. BASE-CCP defaults to CCP:0. (define (constant-ccp char . args) (let-optionals args ((char-domain char-set:full) (base ccp:0)) (let ((cmap (string-copy (ccp:map base)))) (make-ccp (install-constant-ccp! cmap (char-set-copy (ccp:domain base)) char char-domain) #f cmap #f)))) (define (constant-ccp! char char-domain base) (linear-update-ccp base (lambda (d cmap) (install-constant-ccp! cmap d char char-domain)))) ;;; Install the constant mapping into CMAP0 by side-effect, ;;; linear-update & return DOMAIN0 with the constant-mapping's domain. (define (install-constant-ccp! cmap0 domain0 char char-domain) (char-set-for-each (lambda (c) (string-set! cmap0 (char->ascii c) char)) char-domain) (char-set-union! domain0 char-domain)) ;;; CCP/MAPPINGS ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; (ccp/mappings from1 to1 from2 to2 ...) -> ccp ;;; (extend-ccp/mappings base-ccp from1 to1 from2 to2 ...) -> ccp ;;; (extend-ccp/mappings! base-ccp from1 to1 from2 to2 ...) -> ccp ;;; Each FROM element is either a string or a (lo-char . hi-char) range. ;;; Each TO element is either a string or a lo-char. Strings are replicated ;;; to match the length of the corresponding FROM element. ;;; CCP/MAPPINGS's base CCP is CCP:0 ;;; ;;; Tedious code. ;;; Internal utility. ;;; Install the FROM->TO mapping pair into DOMAIN & CMAP by side-effect. ;;; Return the new domain. (define (install-ccp-mapping-pair! cmap domain from to) ;; Tedium -- four possibilities here: ;; str->str, str->lo-char, ;; range->str, range->lo-char. (if (string? from) (if (string? to) ;; "abc" -> "ABC" (let ((len1 (string-length from)) (len2 (string-length to))) (let lp2 ((i (- len1 1)) (j (modulo (- len2 1) len1)) (d domain)) (if (>= i 0) (let ((c (string-ref from i))) (string-set! cmap (char->ascii c) (string-ref to i)) (lp2 (- i 1) (- (if (> j 0) j len2) 1) (char-set-adjoin! d c))) d))) ;; "abc" -> #\A (let lp2 ((i (- (string-length from) 1)) (j (char->ascii to)) (d domain)) (if (>= i 0) (let ((c (string-ref from i))) (string-set! cmap (char->ascii c) (ascii->char j)) (lp2 (- i 1) (- j 1) (char-set-adjoin! d c))) d))) (let ((from-start (char->ascii (car from))) (from-end (char->ascii (cdr from)))) (if (string? to) (let ((len2-1 (- (string-length to) 1))) ;; (#\a . #\c) -> "ABC" (let lp2 ((i from-start) (j 0) (d domain)) (if (<= i from-end) (let ((c (string-ref to j))) (string-set! cmap i c) (lp2 (+ i 1) (if (= j len2-1) 0 (+ j 1)) (char-set-adjoin! d c))) d))) ;; (#\a . #\c) -> #\A (do ((i from-start (+ i 1)) (j (char->ascii to) (+ j 1)) (d domain (begin (string-set! cmap i (ascii->char j)) (char-set-adjoin d (ascii->char i))))) ((> i from-end) d)))))) ;;; Internal utility -- side-effects CMAP; linear-updates & returns DOMAIN. (define (install-mapping-pairs cmap domain args) (let lp ((domain domain) (args args)) (if (pair? args) (lp (install-ccp-mapping-pair! cmap domain (car args) (cadr args)) (cddr args)) domain))) (define (ccp/mappings . args) (let ((cmap (make-string num-chars))) (make-ccp (install-mapping-pairs (make-string num-chars) (char-set-copy char-set:empty) args) #f cmap #f))) (define (extend-ccp/mappings base . args) (let ((cmap (string-copy (ccp:map base)))) (make-ccp (install-mapping-pairs cmap (char-set-copy (ccp:domain base)) args) #f cmap #f))) (define (extend-ccp/mappings! base . args) (linear-update-ccp base (lambda (d cmap) (install-mapping-pairs cmap d args)))) ;;; CONSTRUCT-CCP! ccp elt ... ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; The kitchen-sink constructor; static typing be damned. ;;; ELTS are interpreted as follows: ;;; (lo-char . hi-char) to-string|lo-char ; ccp/range ;;; from-string to-string|lo-char ; ccp/range ;;; ccp ; ccp-extend ;;; alist ; alist->ccp ;;; domain char ; ccp-constant ;;; domain proc ; proc->ccp (define (construct-ccp! ccp . elts) (linear-update-ccp ccp (lambda (d cmap) (install-ccp-construct! cmap d elts)))) (define (construct-ccp base . elts) (let ((cmap (string-copy (ccp:map base)))) (make-ccp (install-ccp-construct! cmap (char-set-copy (ccp:domain base)) elts) #f cmap #f))) ;;; Install the mappings into CMAP by side-effect, ;;; linear-update & return DOMAIN with the final domain. (define (install-ccp-construct! cmap domain elts) (let lp ((d domain) (elts elts)) ;(format #t "d=~s elts=~s\n" d elts) (if (not (pair? elts)) d (let ((elt (car elts)) (elts (cdr elts))) (cond ((pair? elt) (cond ((pair? (car elt)) ; ELT is an alist. (lp (install-ccp-alist! cmap d elt) elts)) ((char? (car elt)) ; ELT is (lo-char . hi-char) range. (lp (install-ccp-mapping-pair! cmap d elt (car elts)) (cdr elts))) (else (error "Illegal elt to construct-ccp" elt)))) ((string? elt) (lp (install-ccp-mapping-pair! cmap d elt (car elts)) (cdr elts))) ((ccp? elt) (lp (install-ccp-extension! cmap d elt) elts)) ((char-set? elt) (let ((elt2 (car elts)) (elts (cdr elts))) (lp (cond ((char? elt2) (install-constant-ccp! cmap d elt2 elt)) ((procedure? elt2) (install-ccp-proc! cmap d elt2 elt)) (else (error "Illegal elt-pair to construct-ccp" elt elt2))) elts))) (else (error "Illegal elt to construct-ccp" elt))))))) ;;; CCP unfold (define (ccp-unfold p f g seed) (let lp ((seed seed) (ccp (ccp-copy ccp:0))) (if (p seed) ccp (lp (g seed) (receive (from to) (f seed) (lp (g seed) (ccp-adjoin! ccp from to))))))) ;;; Using CCPs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; TR ccp string [start end] -> string ;;; CCP-MAP ccp string [start end] -> string ;;; CCP-MAP! ccp string [start end] -> undefined ;;; CCP-APP ccp char -> char or false ;;; If a char in S is not in CCP's domain, it is dropped from the result. ;;; You can use this to map and delete chars from a string. (define (tr ccp s . maybe-start+end) (let-optionals maybe-start+end ((start 0) (end (string-length s))) ;; Count up the chars in S that are in the domain, ;; and allocate the answer string ANS: (let* ((len (- end start)) (domain (ccp:domain ccp)) (ans-len (string-fold (lambda (c numchars) (if (char-set-contains? domain c) (+ numchars 1) numchars)) 0 s start end)) (ans (make-string ans-len))) ;; Apply the map, installing the resulting chars into ANS: (string-fold (lambda (c i) (cond ((ccp-app ccp c) => (lambda (c) (string-set! ans i c) (+ i 1))) (else i))) ; Not in domain -- drop it. 0 s start end) ans))) (define (ccp-map ccp s . maybe-start+end) (apply string-map (lambda (c) (ccp-app ccp c)) s maybe-start+end)) (define (ccp-map! ccp s . maybe-start+end) (apply string-map! (lambda (c) (ccp-app ccp c)) s maybe-start+end)) (define (ccp-app ccp char) (and (char-set-contains? (ccp:domain ccp) char) (string-ref (ccp:map ccp) (char->ascii char)))) ;;; Primitive CCPs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define id-cmap (let ((m (make-string num-chars))) (do ((i (- num-chars 1) (- i 1))) ((< i 0)) (string-set! m i (ascii->char i))) m)) (define ccp:0 (make-ccp char-set:empty #t id-cmap #t)) (define ccp:1 (make-ccp char-set:full #t id-cmap #t)) (define ccp:upcase (proc->ccp char-upcase char-set:full)) (define ccp:downcase (proc->ccp char-downcase char-set:full))