Nyquist / XLISP 2.0  -  Contents | Tutorials | Examples | Reference

Characters and Strings


Strings are also Sequences.


make-string


(defun make-string (length initial-element)
  ;; Return a string made of LENGTH copies of the character INITIAL-ELEMENT.
  ;;
  ;; LENGTH must be an integer greater than zero and INITIAL-ELEMENT must
  ;; be a character; otherwise an error is signalled. LENGTH is checked
  ;; before INITIAL-ELEMENT.
  (if (and (integerp length) (plusp length))
      (if (characterp initial-element)
          ;; PIECE is the one-character string that gets appended
          ;; LENGTH times to the (initially empty) RESULT.
          (do ((piece  (string initial-element))
               (result "" (strcat result piece))
               (count  0  (1+ count)))
              ((>= count length) result))
          (error "not a character" initial-element))
      (error "not a positive integer" length)))

  Back to top


string*


(string* [expr1 ...])
exprN - arbitrary Lisp expressions
returns - the expression[s], converted and concatenated into a single string

The 'string*' function tries to make a string out of everything:

(defun string* (&rest items)
  ;; Convert every argument to a string and concatenate the pieces into
  ;; a single result string. Called without arguments, the empty string
  ;; is returned.
  (if (null items)
      ""
      (let ((end (length items))
            (result ""))
        ;; STRCAT-ELEMENT appends the string form of ELEMENT to RESULT.
        ;; Conses and arrays are converted by a recursive STRING* call,
        ;; everything else is printed with the "~a" FORMAT directive.
        (labels ((strcat-element (element)
                   (let ((string (if (or (consp element) (arrayp element))
                                     (string* element)
                                     (format nil "~a" element))))
                     (setq result (strcat result string)))))
          (dotimes (index end)
            ;; An argument that is EQ to the symbol *UNBOUND* is replaced
            ;; by the literal text "*UNBOUND*".
            (if (eq (nth index items) '*unbound*)
                (strcat-element "*UNBOUND*")
                (let ((item (nth index items)))
                  ;; Lists and arrays are walked element by element; the
                  ;; inner END and INDEX shadow the outer variables.
                  (case (type-of item)
                    (cons  (let ((end (length item)))
                             ;; NOTE(review): the element count is raised by one
                             ;; when (last item) is not a cons -- presumably meant
                             ;; for dotted lists; confirm what LAST returns here.
                             (when (not (consp (last item))) (incf end))
                             (dotimes (index end)
                               (if (eq (nth index item) '*unbound*)
                                   (strcat-element "*UNBOUND*")
                                   (strcat-element (nth index item))))))
                    (array (let ((end (length item)))
                             (dotimes (index end)
                               (if (eq (aref item index) '*unbound*)
                                   (strcat-element "*UNBOUND*")
                                   (strcat-element (aref item index))))))
                    ;; Any other type is appended directly.
                    (t     (strcat-element item))))))
          result))))

Examples:

(string*)              => ""

(string* #\A "B" 'c)   => "ABC"
(string* 1 2 3)        => "123"

(string* 1 "st")       => "1st"
(string* "2" #\n #\d)  => "2nd"

(setq a 3)             => 3
(string* 'a "=" a)     => "A=3"

Nested expressions will be flattened:

(string* #(1 (#\2) "3"))  => "123"

The result may contain nonsense:

(string* #'car)                    => "#<Subr-CAR: #8645768>"
(string* '(lambda (x) (print x)))  => "LAMBDAXPRINTX"

  Back to top


POSIX Character Classes


The built-in XLISP character test functions upper-case-p, lower-case-p, and both-case-p return the boolean values  T  or NIL instead of the tested character, while digit-char-p returns an integer or NIL. This is handy if you want to convert arbitrary Lisp symbols into numbers without producing an error, but it is impractical for writing a string parser.

The following functions implement tests for the standard POSIX character classes, where all functions return the tested character if the test succeeds, or NIL if the test fails. The 'internal' functions do not check if the argument is a character and therefore are faster than the 'user' functions. Also note that XLISP is limited to ASCII characters, so there is no way to find out if a Unicode character is upper- or lowercase if the character code is greater than ASCII 127.

   POSIX   -   Internal   -   User Function
   [: alnum :]   -   char:alnum-p   -   alnum-character-p   -  alphanumeric = [a-z], [A-Z], [0-9]
   [: alpha :]   -   char:alpha-p   -   alpha-character-p   -  alphabetic = [a-z], [A-Z]
   [: blank :]   -   char:blank-p   -   blank-character-p   -  space and horizontal-tab
   [: cntrl :]   -   char:cntrl-p   -   cntrl-character-p   -  code-chars 0-31 and 127
   [: digit :]   -   char:digit-p   -   digit-character-p   -  decimal = [0-9]
   [: graph :]   -   char:graph-p   -   graph-character-p   -  graphical = alnum + punct
   [: lower :]   -   char:lower-p   -   lower-character-p   -  lowercase = [a-z]
   [: print :]   -   char:print-p   -   print-character-p   -  printable = alnum + punct + space
   [: punct :]   -   char:punct-p   -   punct-character-p   -  punctuation marks
   [: space :]   -   char:space-p   -   space-character-p   -  characters producing whitespace
   [: upper :]   -   char:upper-p   -   upper-character-p   -  uppercase = [A-Z]
   [: xdigit :]   -   char:xdigit-p   -   xdigit-character-p   -  hexadecimal = [0-9], [a-f], [A-F]

Internal Functions for POSIX character classes:

;; alphanumeric characters = a-z, A-Z, 0-9

(defun char:alnum-p (char)
  ;; Return CHAR if ALPHANUMERICP accepts it, otherwise NIL.
  ;; The argument is not checked for being a character.
  (if (alphanumericp char)
      char
      nil))

;; alphabetic characters = a-z, A-Z

(defun char:alpha-p (char)
  ;; Return CHAR if it is an alphabetic character, otherwise NIL.
  ;; The argument is not checked for being a character.
  ;;
  ;; The test uses the built-in BOTH-CASE-P predicate. The former call
  ;; to BOTH-CHAR-P referred to a function that does not exist in XLISP
  ;; and therefore signalled an "unbound function" error on every call.
  (and (both-case-p char)
       char))

;; blanks = space and horizontal-tab

(defun char:blank-p (char)
  ;; Return CHAR if it is a space or a horizontal tab, otherwise NIL.
  ;; The argument is not checked for being a character.
  (cond ((char= char #\Space) char)
        ((char= char #\Tab)   char)
        (t                    nil)))

;; control characters = code-chars 0-31 and 127

(defun char:cntrl-p (char)
  ;; Return CHAR if its character code is in the range 0-31 or is 127,
  ;; otherwise NIL. The argument is not checked for being a character.
  (let ((ascii (char-code char)))
    (when (or (<= 0 ascii 31)
              (= ascii 127))
      char)))

;; decimal digits = 0-9

(defun char:digit-p (char)
  ;; Return CHAR if DIGIT-CHAR-P accepts it as a decimal digit,
  ;; otherwise NIL. The argument is not checked for being a character.
  (when (digit-char-p char)
    char))

;; graphical characters = alnum + punct

(defun char:graph-p (char)
  ;; Return CHAR if its character code is in the range 33-126,
  ;; otherwise NIL. The argument is not checked for being a character.
  (let ((ascii (char-code char)))
    (if (<= 33 ascii 126)
        char
        nil)))

;; lowercase characters = a-z

(defun char:lower-p (char)
  ;; Return CHAR if LOWER-CASE-P accepts it, otherwise NIL.
  ;; The argument is not checked for being a character.
  (if (lower-case-p char)
      char
      nil))

;; printable characters = alnum + punct + space

(defun char:print-p (char)
  ;; Return CHAR if its character code is in the range 32-126,
  ;; otherwise NIL. The argument is not checked for being a character.
  (when (<= 32 (char-code char) 126)
    char))

;; punctuation marks

(defun char:punct-p (char)
  ;; Return CHAR if its character code lies in one of the four ASCII
  ;; punctuation ranges, otherwise NIL. The argument is not checked for
  ;; being a character.
  (let ((ascii (char-code char)))
    (cond ((<=  33 ascii  47) char)   ;  ! " # $ % & ' ( ) * + , - . /
          ((<=  58 ascii  64) char)   ;  : ; < = > ? @
          ((<=  91 ascii  96) char)   ;  [ \ ] ^ _ `
          ((<= 123 ascii 126) char)   ;  { | } ~
          (t                  nil))))

;; characters producing whitespace
;;
;;  9 = horizontal tab   10 = line feed         11 = vertical tab
;; 12 = form feed        13 = carriage return   32 = space

(defun char:space-p (char)
  ;; Return CHAR if its character code is one of 9, 10, 11, 12, 13
  ;; or 32, otherwise NIL. The argument is not checked for being a
  ;; character.
  (let ((ascii (char-code char)))
    (if (member ascii '(9 10 11 12 13 32))
        char
        nil)))

;; uppercase characters = A-Z

(defun char:upper-p (char)
  ;; Return CHAR if UPPER-CASE-P accepts it, otherwise NIL.
  ;; The argument is not checked for being a character.
  (when (upper-case-p char)
    char))

;; hexadecimal digits = 0-9, a-f, A-F

(defun char:xdigit-p (char)
  ;; Return CHAR if it is a hexadecimal digit, otherwise NIL.
  ;; The argument is not checked for being a character.
  ;;
  ;; Decimal digits are recognized by DIGIT-CHAR-P, the letters by
  ;; their character codes.
  (if (digit-char-p char)
      char
      (let ((ascii (char-code char)))
        (when (or (<= 65 ascii  70)    ; A-F
                  (<= 97 ascii 102))   ; a-f
          char))))

User Functions for POSIX character classes:

;; alphanumeric characters = a-z, A-Z, 0-9

(defun alnum-character-p (char)
  ;; Type-checking front end for CHAR:ALNUM-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:ALNUM-P result.
  (when (characterp char)
    (char:alnum-p char)))

;; alphabetic characters = a-z, A-Z

(defun alpha-character-p (char)
  ;; Type-checking front end for CHAR:ALPHA-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:ALPHA-P result.
  (if (characterp char)
      (char:alpha-p char)
      nil))

;; blanks = space and horizontal-tab

(defun blank-character-p (char)
  ;; Type-checking front end for CHAR:BLANK-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:BLANK-P result.
  (when (characterp char)
    (char:blank-p char)))

;; control characters = code-chars 0-31 and 127

(defun cntrl-character-p (char)
  ;; Type-checking front end for CHAR:CNTRL-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:CNTRL-P result.
  (if (characterp char)
      (char:cntrl-p char)
      nil))

;; decimal digits = 0-9

(defun digit-character-p (char)
  ;; Type-checking front end for CHAR:DIGIT-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:DIGIT-P result.
  (when (characterp char)
    (char:digit-p char)))

;; graphical characters = alnum + punct

(defun graph-character-p (char)
  ;; Type-checking front end for CHAR:GRAPH-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:GRAPH-P result.
  (if (characterp char)
      (char:graph-p char)
      nil))

;; lowercase characters = a-z

(defun lower-character-p (char)
  ;; Type-checking front end for CHAR:LOWER-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:LOWER-P result.
  (when (characterp char)
    (char:lower-p char)))

;; printable characters = alnum + punct + space

(defun print-character-p (char)
  ;; Type-checking front end for CHAR:PRINT-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:PRINT-P result.
  (if (characterp char)
      (char:print-p char)
      nil))

;; punctuation marks

(defun punct-character-p (char)
  ;; Type-checking front end for CHAR:PUNCT-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:PUNCT-P result.
  (when (characterp char)
    (char:punct-p char)))

;; characters producing whitespace

(defun space-character-p (char)
  ;; Type-checking front end for CHAR:SPACE-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:SPACE-P result.
  (if (characterp char)
      (char:space-p char)
      nil))

;; uppercase characters = A-Z

(defun upper-character-p (char)
  ;; Type-checking front end for CHAR:UPPER-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:UPPER-P result.
  (when (characterp char)
    (char:upper-p char)))

;; hexadecimal digits = 0-9, a-f, A-F

(defun xdigit-character-p (char)
  ;; Type-checking front end for CHAR:XDIGIT-P: anything that is not a
  ;; character gives NIL, a character gives the CHAR:XDIGIT-P result.
  (if (characterp char)
      (char:xdigit-p char)
      nil))

  Back to top


Unicode


The UTF-8 functions may help to write custom UTF-8 string access functions like UTF-8-SUBSEQ or UTF-8-STRING-SEARCH with no need to care about the underlying low-level octal sequences.

In the list of "string-characters" every ASCII or UTF-8 character from 1-byte to 4-byte is represented by its own list element:

(utf-8-string-to-list "hällö") => ("h" "\303\244" "l" "l" "\303\266")
                                    h       ä      l   l       ö

The list can be manipulated by standard Nyquist list functions and then re-converted into a string by UTF-8-LIST-TO-STRING.

Practical examples

In Nyquist code, non-ASCII characters are represented by their native byte sequences, written as escaped octal numbers:

(print "ä")  => "\303\244"  ; on UTF-8 systems

So, for example, the second character "ä" of "hällö" in the list above, represented by the octal sequence "\303\244", can be matched like this:

(let ((string-list (utf-8-string-to-list "hällö")))
  (string= "ä" (nth 1 string-list)))  ; 0 = first, 1 = second element
=> T                                  ; T = true = identified

Advantage: The number of the element in the list is the same as the number of the character in the string, independent from the number of bytes in the underlying character encoding.

;; The UTF-8 toolbox is intended to manipulate UTF-8 encoded file-
;; or directory names, typed in by the user or read from environment
;; variables, before they are given to SETDIR or OPEN.
;;
;; Information from the environment
;;
;; Because the encoding of the non-ASCII characters depends on the
;; underlying operating system [with non-unicode operating systems
;; there will be no UTF-8 encoding available], it's always better
;; to refer to strings from environment variables, user input, or
;; strings returned from the underlying file system, instead of
;; hand-coded strings in the Nyquist source code, for example:
;;
;;   GET-ENV - can read strings from environment variables:
;;
;;     (defun user-home-directory ()
;;       (or (get-env "HOME")           ; Unix
;;           (get-env "UserProfile")))  ; Windows
;;
;; On Windows, there is no HOME variable defined by Windows itself,
;; but most programs will respect a HOME variable, if one has been
;; defined by the user. That's why the HOME variable is read first.
;;
;;   SETDIR - can test if a directory exists and return its name:
;;
;;     (defun directory-exists-p (string)
;;       (let ((orig-dir (setdir "."))
;;             (new-dir  (setdir string)))
;;         (when (string/= orig-dir new-dir)
;;           (setdir orig-dir)
;;           new-dir)))
;;
;; SETDIR always returns absolute directory names, even if STRING is a
;; relative directory name. That's why DIRECTORY-EXISTS-P first stores
;; the absolute name of the current working directory in the ORIG-DIR
;; variable and compares it then against the absolute directory name
;; returned by SETDIR when it tries to change the directory to STRING.
;;
;;   OPEN - can test if a file exists and return its name:
;;
;;     (defun file-exists-p (string)
;;       (unless (directory-exists-p string)
;;         (let (file-stream)
;;           (unwind-protect
;;             (setq file-stream (open string))
;;             (when file-stream (close file-stream)))
;;           (when file-stream string))))
;;
;; On Unix, a directory is a special kind of file, so the Nyquist/XLISP
;; OPEN function opens directories, too. That's why FILE-EXISTS-P first
;; must test and make sure that STRING is not the name of a directory.
;;
;;; Known bugs and limitations of the UTF-8 toolbox:
;;
;; The UTF-8 toolbox does not provide support for UTF-8 case-detection
;; or UTF-8 case-conversion. It cannot be detected if a UTF-8 character
;; is upper- or lowercase, it's also not possible to convert characters
;; from upper- to lowercase or vice versa.
;;
;; The library does not provide functions to compare UTF-8 characters
;; or to sort UTF-8 characters.
;;
;; The XLISP character functions do not work with UTF-8 octal sequences,
;; so matching must be done via XLISP's STRING= and STRING/= functions.
;;
;; The XLISP string comparison functions like STRING<, STRING>, etc.
;; do not work reliably with multibyte characters.
;;
;; The string matching and sorting algorithms of the Unicode Consortium
;; are too complex to be implemented in XLISP with reasonable speed.
;;
;; See: http://www.unicode.org/reports/tr10/ - string comparison
;;
;; The library is implemented in interpreted Lisp, so please do not
;; expect high-speed performance with advanced list manipulations.
;;
;; The library still has not been tested with ISO encoded systems.
;;

UTF-8 Encoding - see also http://en.wikipedia.org/wiki/UTF-8

In a UTF-8 encoded character, the first byte starts with:

;;    one-byte  0xxxxxxx -> legal char-codes   0 to 127  [UTF-8/ASCII]
;;    two-byte  110xxxxx -> legal char-codes 194 to 223  [UTF-8]
;;  three-byte  1110xxxx -> legal char-codes 224 to 239  [UTF-8]
;;   four-byte  11110xxx -> legal char-codes 240 to 244  [UTF-8]
;;
;; The second, third, and fourth bytes start with:
;;
;;              10xxxxxx -> legal char-codes 128 to 191  [UTF-8]

UTF-8-BYTE-P tests if an XLISP character is a valid UTF-8 byte

(defun utf-8-byte-p (char)
  ;; Return CHAR if it is a character whose code is in one of the
  ;; ranges 0-191 or 194-244, otherwise NIL. Arguments that are not
  ;; characters give NIL, too.
  (if (characterp char)
      (let ((byte (char-code char)))
        (cond ((<=   0 byte 191) char)
              ((<= 194 byte 244) char)
              (t                 nil)))
      nil))

UTF-8-BYTES tries to determine from the XLISP character code how many bytes the character has in UTF-8 encoding

(defun utf-8-bytes (char)
  ;; Return the number of bytes (1 to 4) of the UTF-8 sequence that is
  ;; introduced by the lead byte CHAR.
  ;;
  ;; An error is signalled if CHAR is not a character, if UTF-8-BYTE-P
  ;; rejects it, or if its code matches none of the lead byte ranges
  ;; below.
  (unless (characterp char)
    (error "not a character" char))
  (unless (utf-8-byte-p char)
    (error "invalid UTF-8 byte" char))
  (let ((lead (char-code char)))
    (cond ((<=   0 lead 127) 1)  ; one byte [= ASCII]
          ((<= 194 lead 223) 2)  ; two bytes
          ((<= 224 lead 239) 3)  ; three bytes
          ((<= 240 lead 244) 4)  ; four bytes
          (t (error "utf-8-bytes: not an UTF-8 identifer" char)))))

UTF-8-STRING-TO-LIST converts a string containing ASCII or UTF-8 characters from one to four bytes into a list, where:

;; Every character (single-byte or multi-byte) is represented
;; by its own list element:
;;
;;   (utf-8-string-to-list "hällö") => ("h" "\303\244" "l" "l" "\303\266")
;;                                       h       ä      l   l       ö
;;
;; The list can be manipulated by standard XLISP list functions and
;; then re-converted into a string by UTF-8-LIST-TO-STRING below.
(defun utf-8-string-to-list (string)
  ;; Split STRING into a list of "string-characters": every list element
  ;; is a string holding the one to four bytes of a single ASCII or
  ;; UTF-8 character, in the order of their appearance in STRING.
  ;;
  ;; The empty string gives NIL. An error is signalled if STRING is not
  ;; a string, if a byte is rejected by UTF-8-BYTES or UTF-8-BYTE-P, or
  ;; if STRING ends in the middle of a multi-byte sequence.
  (cond
    ((not (stringp string))
     (error "utf-8-string-to-list: not a string" string))
    ((string= "" string) nil)
    (t
     (let ((end (length string))
           (list nil))
       (do ((index 0 (1+ index)))
           ((>= index end))
         (let* ((char (char string index))
                ;; number of bytes that follow the lead byte
                (bytes (1- (utf-8-bytes char)))
                (utf-8 (string char)))
           (dotimes (rest-of bytes) ; runs only if bytes > 0
             ;; INDEX is the loop variable of the enclosing DO and is
             ;; advanced here to consume the bytes following the lead byte.
             (incf index)
             ;; Valid indices are 0 to END - 1, so INDEX = END is already
             ;; out of range and must be caught before CHAR is called.
             (if (>= index end)
                 (error "utf-8-string-to-list: index out of range" index)
                 (let ((byte (char string index)))
                   (if (not (utf-8-byte-p byte))
                       (error "utf-8-string-to-list: invalid UTF-8 byte" byte)
                       (setq utf-8 (strcat utf-8 (string byte)))))))
           ;; elements are collected in reverse order
           (push utf-8 list)))
       (reverse list)))))
;; UTF-8-LIST-TO-STRING re-converts a list containing ASCII and
;; UTF-8 "string-characters" back to a XLISP string, intended
;; to be given to SETDIR or OPEN for file or directory operations.
(defun utf-8-list-to-string (list)
  ;; Concatenate the strings in LIST, in order, into a single string.
  ;;
  ;; The empty list gives the empty string. An error is signalled if
  ;; LIST is not a list or if one of its elements is not a string.
  (if (listp list)
      (let ((joined ""))
        (dolist (piece list joined)
          (unless (stringp piece)
            (error "utf-8-list-to-string: not a string" piece))
          (setq joined (strcat joined piece))))
      (error "utf-8-list-to-string: not a list" list)))

  Back to top


Nyquist / XLISP 2.0  -  Contents | Tutorials | Examples | Reference