;; This code derived from PLT Scheme. ;; The license is LGPL. ;; Since Matthew implemented this module, tests are in ;; plt/collects/tests/mzscheme/char-set.ss. ;; Since Mike Sperber looked carefully at this module, ;; the code and tests are a lot better than they would be. ;;(module char-set mzscheme ;; (require (lib "integer-set.ss") ;; (all-except (lib "contract.ss") union) ;; (rename (lib "contract.ss") union/c union)) (define null '()) (load "integer-set.scm") ;; This definition doesn't work for some reason in integer-set.scm. ;; Doesn't work here either - got another below. (define union merge) ;; Data defn ---------------------------------------- (define-record-type char-set (make-char-set set/thunk) char-set? (set/thunk char-set-set/thunk char-set-set/thunk!)) (define (fold-set op init l) (if (null? l) init (fold-set op (op init (car l)) (cdr l)))) (define (char-set-set cs) (let ((v (char-set-set/thunk cs))) (if (procedure? v) (let ((v2 (v))) (char-set-set/thunk! cs v2) v2) v))) ;; General procedures ---------------------------------------- (define char-set= (case-lambda (() #t) ((cs) #t) ((cs1 cs2) (equal? (integer-set-contents (char-set-set cs1)) (integer-set-contents (char-set-set cs2)))) ((cs1 . rest) (fold-set (lambda (v cs) (and v (char-set= cs1 cs))) #t rest)))) (define char-set<= (case-lambda (() #t) ((cs) #t) ((cs1 cs2) (subset? (char-set-set cs1) (char-set-set cs2))) ((cs1 . rest) (and (fold-set (lambda (cs1 cs) (and cs1 (char-set<= cs1 cs) cs)) cs1 rest) #t)))) (define char-set-hash (case-lambda ((cs) (abs (equal-hash-code (char-set-set cs)))) ((cs bound) (modulo (char-set-hash cs) bound)))) ;; Iterating over character sets ---------------------------------------- ;; A cursor is (cons num (list-of (cons start-num end-num))) ;; where the first num indicates how far we are into the ;;; first range of the cdr of the cursor. (define (char-set-cursor cs) (cons 0 (integer-set-contents (char-set-set cs)))) (define (char-set-ref cs c) (integer->char (+ (car c) (caadr c)))) (define (char-set-cursor-next cs c) (let ((d (- (cdadr c) (caadr c)))) (if (= d (car c)) (cons 0 (cddr c)) (cons (add1 (car c)) (cdr c))))) (define (end-of-char-set? c) (null? (cdr c))) (define (char-set-fold/done kons knil cs done?) (let loop ((v knil)(l (integer-set-contents (char-set-set cs)))) (if (null? l) v (let ((end (cdar l))) (let iloop ((v v) (i (caar l))) (if (> i end) (loop v (cdr l)) (let ((v (kons (integer->char i) v))) (if (done? v) v (iloop v (add1 i)))))))))) (define (char-set-fold kons knil cs) (char-set-fold/done kons knil cs (lambda (x) #f))) (define char-set-unfold (case-lambda ((p f g seed) (char-set-unfold p f g seed char-set:empty)) ((p f g seed base-cs) ;; Implementation taken directly from SRFI-14: (let lp ((seed seed) (cs base-cs)) (if (p seed) cs ; P says we are done. (lp (g seed) ; Loop on (G SEED). (char-set-adjoin! cs (f seed)))))))) (define (char-set-unfold! p f g seed base-cs) (char-set-unfold p f g seed base-cs)) (define (char-set-for-each proc cs) (char-set-fold (lambda (c v) (proc c)) (void) cs)) (define (char-set-map proc cs) ;; Note: no order defined on cs traversal, so it doesn't ;; matter that we build up the list backward (char-set-fold (lambda (c v) (char-set-adjoin v (proc c))) char-set:empty cs)) ;; Creating character sets ---------------------------------------- (define (char-set-copy cs) ;; Our char sets are purely functional: cs) (define mk-char-set (let ((char-set (lambda more (list->char-set more char-set:empty)))) char-set)) (define list->char-set (case-lambda ((l) (list->char-set l char-set:empty)) ((l cs) (fold-set char-set-adjoin cs l)))) (define (list->char-set! l cs) (list->char-set l cs)) (define string->char-set (case-lambda ((s) (string->char-set s char-set:empty)) ((s cs) (list->char-set (string->list s) cs)))) (define (string->char-set! s cs) (string->char-set s cs)) (define char-set-filter (case-lambda ((pred cs) (char-set-filter pred cs char-set:empty)) ((pred cs base-cs) (char-set-fold (lambda (c v) (if (pred c) (char-set-adjoin v c) v)) base-cs cs)))) (define (char-set-filter! pred cs base-cs) (char-set-filter pred cs base-cs)) (define ucs-range->char-set (case-lambda ((lower upper) (ucs-range->char-set lower upper #f char-set:empty)) ((lower upper error?) (ucs-range->char-set lower upper error? char-set:empty)) ((lower upper error? cs) (when (or (< lower 0) (> upper #x110000) (>= lower upper)) (raise (make-exn:fail:contract (string->immutable-string (format "ucs-range->char-set: invalid range: (~a, ~a)" lower upper)) (current-continuation-marks)))) (char-set-union cs (cond ((and (<= upper #xE000) (>= lower #xD800)) ;; Completely in the hole char-set:empty) ((<= upper #xE000) ;; Below the hole (make-char-set (make-integer-set (list (cons lower (sub1 (min #xD800 upper))))))) ((>= lower #xD800) ;; Above the hole (make-char-set (make-integer-set (list (cons (max #xE000 lower) (sub1 upper)))))) (else ;; Spans the hole: (make-char-set (make-integer-set (list (cons lower #xD7FF) (cons #xE000 (sub1 upper))))))))))) (define (ucs-range->char-set! lower upper error? base-cs) (ucs-range->char-set lower upper error? base-cs)) (define (->char-set x) (cond ((char? x) (let ((v (char->integer x))) (make-char-set (make-integer-set (list (cons v v)))))) ((string? x) (string->char-set x)) ((char-set? x) x) (else (raise-type-error '->char-set "character, string, or char-set" x)))) ;; Querying character sets ---------------------------------------- (define (char-set-size cs) (let loop ((l (integer-set-contents (char-set-set cs)))(c 0)) (if (null? l) c (loop (cdr l) (+ c 1 (- (cdar l) (caar l))))))) (define (char-set-count pred cs) (char-set-fold (lambda (c v) (+ v (if (pred c) 1 0))) 0 cs)) (define (char-set->list cs) (char-set-fold cons null cs)) (define (char-set->string cs) (list->string (char-set->list cs))) (define (char-set-contains? cs char) (member? (char->integer char) (char-set-set cs))) (define (char-set-every pred cs) (char-set-fold/done (lambda (c v) (and v (pred c))) #t cs not)) (define (char-set-any pred cs) (char-set-fold/done (lambda (c v) (or v (pred c))) #f cs values)) ;; Character-set algebra ---------------------------------------- (define char-set-adjoin (case-lambda ((cs char1) (let ((v (char->integer char1))) (make-char-set (union (char-set-set cs) (make-integer-set (list (cons v v))))))) ((cs . more) (fold-set char-set-adjoin cs more)))) (define char-set-delete (case-lambda ((cs char1) (let ((v (char->integer char1))) (make-char-set (difference (char-set-set cs) (make-integer-set (list (cons v v))))))) ((cs . more) (fold-set char-set-delete cs more)))) (define (char-set-complement cs) (make-char-set (union (complement (char-set-set cs) #x0 #xD7FF) (complement (char-set-set cs) #xE000 #x10FFFF)))) (define-syntax define-set-op (syntax-rules () ((define-set-op char-set-op set-op neutral) (define char-set-op (case-lambda ((cs1 cs2) (make-char-set (set-op (char-set-set cs1) (char-set-set cs2)))) (() neutral) ((cs1 . more) (fold-set char-set-op cs1 more))))))) (define-set-op char-set-union union char-set:empty) (define-set-op char-set-intersection intersect char-set:full) (define char-set-difference (case-lambda ((cs1 cs2) (make-char-set (difference (char-set-set cs1) (char-set-set cs2)))) ((cs1 . more) (fold-set char-set-difference cs1 more)))) (define-set-op char-set-xor xor char-set:empty) (define char-set-diff+intersection (case-lambda ((cs1 cs2) (let-values (((cs1^cs2 cs1-cs2 cs2-cs1) (split (char-set-set cs1) (char-set-set cs2)))) (values (make-char-set cs1-cs2) (make-char-set cs1^cs2)))) ((cs1 cs2 . more) (let-values (((d i) (char-set-diff+intersection cs1 cs2))) (values (apply char-set-difference d more) (apply char-set-intersection i more)))))) (define char-set-adjoin! char-set-adjoin) (define char-set-delete! char-set-delete) (define char-set-complement! char-set-complement) (define char-set-union! char-set-union) (define char-set-intersection! char-set-intersection) (define char-set-difference! char-set-difference) (define char-set-xor! char-set-xor) (define char-set-diff+intersection! char-set-diff+intersection) ;; ---------------------------------------- ;; This definition doesn't work for some reason in integer-set.scm. (define union merge) ;; Unicode goes to #x10FFFF but Java only has 16bit chars. (define-constant max-char (char->integer (static-field "MAX_VALUE"))) ;; Can't use the recursive version. Even runs out of memory with full-tailcalls. (define (make-standard-setx pred?) (make-char-set (lambda () (let loop ((i 0) (result (make-range))) (cond ((> i max-char) result) ((and (not (and (>= i #xD800) (<= i #xDFFF))) (pred? (integer->char i))) (do ((j (add1 i) (add1 j))) ((or (> j max-char) (not (pred? (integer->char j)))) (loop (add1 j) (union (make-range i (sub1 j)) result))))) (else (loop (add1 i) result)) ) ) ) ) ) (define (make-standard-set pred?) (make-char-set (lambda () (do ((next 0) (i 0 next) (result (make-range))) ((> i max-char) result) (if (and (not (and (>= i #xD800) (<= i #xDFFF))) (pred? (integer->char i))) (do ((j (add1 i) (add1 j))) ((or (> j max-char) (not (pred? (integer->char j)))) (set! result (union (make-range i (sub1 j)) result)) (set! next (add1 j)) )) (set! next (add1 i))) ) ) ) ) (define (char-title-case? (char )) (invoke-static "isTitleCase" char)) (define (char-iso-control? (char )) (invoke-static "isISOControl" char)) (define (char-punctuation? (char )) (let ((ctype (invoke-static "getType" char))) (memq ctype (list (static-field "CONNECTOR_PUNCTUATION") (static-field "DASH_PUNCTUATION") (static-field "END_PUNCTUATION") (static-field "FINAL_QUOTE_PUNCTUATION") (static-field "INITIAL_QUOTE_PUNCTUATION") (static-field "OTHER_PUNCTUATION") (static-field "START_PUNCTUATION") )) )) (define (char-symbolic? (char )) (let ((ctype (invoke-static "getType" char))) (memq ctype (list (static-field "CURRENCY_SYMBOL") (static-field "MATH_SYMBOL") (static-field "MODIFIER_SYMBOL") (static-field "OTHER_SYMBOL") )) )) (define (char-blank? (char )) (or (memq (char->integer char) '(#x09 #xA0)) (eq? (invoke-static "getType" char) (static-field "SPACE_SEPARATOR")) )) ;; Need to define a correct char-whitespace? since Java's isn't really quite right. (define-constant char-set:lower-case (make-standard-set char-lower-case?)) (define-constant char-set:upper-case (make-standard-set char-upper-case?)) (define-constant char-set:title-case (make-standard-set char-title-case?)) (define-constant char-set:letter (make-standard-set char-alphabetic?)) (define-constant char-set:digit (make-standard-set char-numeric?)) (define-constant char-set:whitespace (make-standard-set char-whitespace?)) (define-constant char-set:iso-control (make-standard-set char-iso-control?)) (define-constant char-set:punctuation (make-standard-set char-punctuation?)) (define-constant char-set:symbol (make-standard-set char-symbolic?)) (define-constant char-set:blank (make-standard-set char-blank?)) (define-constant char-set:ascii (make-char-set (make-integer-set '((0 . 127))))) (define-constant char-set:hex-digit (make-char-set (make-integer-set '((48 . 57) (65 . 70) (97 . 102))))) (define-constant char-set:empty (make-char-set (make-integer-set '()))) (define-constant char-set:full (make-char-set (make-integer-set (list '(#x0 . #xD7FF) (cons #xE000 max-char))))) (define-constant char-set:letter+digit (char-set-union char-set:letter char-set:digit)) (define-constant char-set:graphic (char-set-union char-set:letter+digit char-set:punctuation char-set:symbol)) (define-constant char-set:printing (char-set-union char-set:whitespace char-set:graphic))