1 ;;; dna-mode.el --- a major mode for editing dna sequences
3 ;; ~harley/share/emacs/pkg/dna/dna-mode.el ---
5 ;; $Id: dna-mode.el,v 1.40 2004/04/20 19:03:04 harley Exp $
8 ;; Author: Harley Gorrell <harley@panix.com>
9 ;; URL: http://www.mahalito.net/~harley/elisp/dna-mode.el
11 ;; Keywords: dna, emacs, editing
14 ;; * A collection of functions for editing DNA sequences. It
15 ;; provides functions to make editing dna in Emacs easier.
18 ;; * Fontify keywords and line numbers in sequences.
19 ;; * Fontify bases when font-lock-mode is disabled.
20 ;; * Incrementally search dna over pads and numbers.
21 ;; * Complement and reverse complement a region.
22 ;; * Move over bases and entire sequences.
23 ;; * Detect sequence files by content.
26 ;; --------------------
27 ;; Here are two suggested ways for installing this package.
28 ;; You can choose to autoload it when needed, or load it
29 ;; each time emacs is started. Put one of the following
30 ;; sections in your .emacs:
33 ;; (autoload 'dna-mode "dna-mode" "Major mode for dna" t)
36 ;; '("\\.\\(fasta\\|fa\\|exp\\|ace\\|gb\\)\\'" . dna-mode))
37 ;; (add-hook 'dna-mode-hook 'turn-on-font-lock)
40 ;; (setq dna-do-setup-on-load t)
41 ;; (load "/pathname/dna-mode")
43 ;; The dna-isearch-forward function (and isearch in general)
44 ;; is much more useful with something like the following:
45 ;; (make-face 'isearch)
46 ;; (set-face-background 'isearch "yellow")
47 ;; (setq-default search-highlight t)
50 ;; 2003-03-16: Updated URL and contact info
51 ;; 2004-04-20: Added dna-color-bases-region to the keymap for Mike.
53 ;;; User customizable vars start here
56 (defvar dna-mode-hook nil
57 "*Hook to setup `dna-mode'.")
59 (defvar dna-mode-load-hook nil
60 "*Hook to run when `dna-mode' is loaded.")
62 (defvar dna-setup-on-load nil
63 "*If not nil setup dna mode on load by running `dna-`add-hook's'.")
66 (defvar dna-valid-base-regexp
67 "[-*:acgtmrwsykvhdbxnACGTMRWSYKVHDBXN]"
68 "*A regexp which matches a single base.")
70 (defvar dna-base-complement-list
71 '((?- . ?-) (?n . ?n) (?* . ?*) (?x . ?x) (?: . ?:) ; identity
72 (?a . ?t) (?c . ?g) (?g . ?c) (?t . ?a) ; single
73 (?m . ?k) (?r . ?y) (?w . ?w) (?s . ?s) (?y . ?r) (?k . ?m) ; double
74 (?v . ?b) (?h . ?d) (?d . ?h) (?b . ?v) ; triple
76 "*List of bases and their complements.
77 Bases should be lowercase, as they are upcased when the `vector is made.")
79 ;; These are the colors used when coloring bases.
80 (defvar dna-base-color-a "blue")
81 (defvar dna-base-color-c "black")
82 (defvar dna-base-color-g "green")
83 (defvar dna-base-color-t "red")
86 (defvar dna-cruft-regexp "[* 0-9\t\n]"
87 "*Regexp to match cruft which may appear between bases.
88 Skip over it during dna-motion and dna-isearch.")
90 (defvar dna-isearch-case-fold-search t
91 "*Case fold dna-isearches if set.")
94 (defvar dna-sequence-start-regexp
95 "^\\(>\\|ID\\|LOCUS\\|DNA\\)"
96 "A regexp which matches the start of a sequence.")
98 ;;; End of user customizable vars
100 ;;; Start of internal vars and code
102 (defvar dna-base-complement-vector
103 (let ((c-vec (make-vector 256 nil))
104 (c-list dna-base-complement-list))
106 (aset c-vec (car (car c-list)) (cdr (car c-list)))
107 (aset c-vec (upcase (car (car c-list))) (upcase (cdr (car c-list))))
108 (setq c-list (cdr c-list)))
110 "A vector of upper and lower case bases and their complements.")
112 ;; I also use "Alt" as C-c is too much to type for cursor motions.
114 (let ((map (make-sparse-keymap)))
116 (define-key map "\C-c\C-f" 'dna-forward-base)
117 (define-key map "\C-cf" 'dna-forward-base)
118 (define-key map "\C-c\C-b" 'dna-backward-base)
119 (define-key map "\C-cb" 'dna-backward-base)
120 (define-key map "\C-c\C-s" 'dna-isearch-forward)
121 (define-key map "\C-cs" 'dna-isearch-forward)
122 (define-key map "\C-cr" 'dna-reverse-complement-region)
123 (define-key map "\C-cc" 'dna-complement-region)
124 (define-key map "\C-c#" 'dna-count-bases-region)
125 (define-key map "\M-\C-h" 'dna-mark-sequence)
126 (define-key map "\M-\C-a" 'dna-beginning-of-sequence)
127 (define-key map "\M-\C-e" 'dna-end-of-sequence)
129 (define-key map "\C-cg" 'dna-color-bases-region)
130 (define-key map "\C-cl" 'font-lock-mode)
131 ;; XEmacs does not like the Alt bindings
132 (when (not (string-match "XEmacs" (emacs-version)))
133 (define-key map [A-right] 'dna-forward-base)
134 (define-key map [A-left] 'dna-backward-base)
135 (define-key map [A-up] 'dna-beginning-of-sequence)
136 (define-key map [A-down] 'dna-end-of-sequence)
137 (define-key map [?\A-\C-s] 'dna-isearch-forward))
139 "The local keymap for `dna-mode'.")
143 "Major mode for editing DNA sequences.
145 This mode also customizes isearch to search over line breaks.
146 Use \\[universal-argument] Number as a prefix to dna-forward-base to move that
147 many bases. This skips line breaks and spaces.
149 dna-color-bases-region disables font-lock-mode automaticly
150 as they cant work together. \\[dna-color-bases-region] turns font-lock-mode back on.
155 (kill-all-local-variables)
156 (setq mode-name "dna")
157 (setq major-mode 'dna-mode)
158 (use-local-map dna-mode-map)
160 (make-local-variable 'font-lock-defaults)
161 (setq font-lock-defaults '(dna-font-lock-keywords))
163 (make-local-variable 'dna-valid-base-regexp)
164 (make-local-variable 'dna-sequence-start-regexp)
165 (make-local-variable 'dna-cruft-regexp)
166 (make-local-variable 'dna-isearch-case-fold-search)
168 (run-hooks 'dna-mode-hook))
171 ;; Todo: Seperate the keywords into a list for each format, rather
173 (defvar dna-font-lock-keywords
176 ("^\\(>\\)\\([-_.|a-zA-Z0-9]+\\)\\([ \t]+.*\\)?"
177 (1 font-lock-keyword-face)
178 (2 font-lock-function-name-face)
179 (3 font-lock-comment-face nil t))
182 ("^\\(ID\\) +\\([-_.a-zA-Z_0-9]+\\)"
183 (1 font-lock-keyword-face) (2 font-lock-function-name-face))
184 ("^\\(CC\\|SQ\\)\\([ \t]\\(.*\\)\\)?$"
185 (1 font-lock-keyword-face) (3 font-lock-comment-face nil t))
186 ("^\\(\\sw\\sw\\)[ \t]"
187 (1 font-lock-keyword-face))
189 (1 font-lock-keyword-face))
191 ;; Ace (phrap output)
192 ("^\\(DNA\\|Sequence\\|BaseQuality\\) +\\([-_.a-zA-Z_0-9]+\\)"
193 (1 font-lock-keyword-face) (2 font-lock-function-name-face))
196 ("^\\(LOCUS\\) +\\([-_.a-zA-Z_0-9]+\\)";; are '-_.' allowed?
197 (1 font-lock-keyword-face) (2 font-lock-function-name-face))
200 ; More genbank keywords...
201 "ACCESSION" "AUTHORS" "AUTHORS" "BASE COUNT" "DEFINITION"
202 "FEATURES" "JOURNAL" "JOURNAL" "KEYWORDS" "MEDLINE" "NID"
203 "ORGANISM" "REFERENCE" "SEGMENT" "SOURCE" "TITLE"
206 ("^[ \t]*\\([0-9]+\\)"
207 (1 font-lock-string-face))
211 "Expressions to hilight in `dna-mode'.")
215 (defun dna-find-file-func ()
216 "Invoke `dna-mode' if the buffer look like a sequence.
217 and another mode is not active.
218 This function is added to `find-file-hooks'."
219 (if (and (eq major-mode 'fundamental-mode)
220 (looking-at dna-sequence-start-regexp))
224 (defun dna-add-hooks ()
225 "Add a default set of dna-hooks.
226 These hooks will activate `dna-mode' when visiting a file
227 which has a dna-like name (.fasta or .gb) or whose contents
228 looks like dna. It will also turn enable fontification for `dna-mode'."
229 (add-hook 'dna-mode-hook 'turn-on-font-lock)
230 (add-hook 'find-file-hooks 'dna-find-file-func)
233 '("\\.\\(fasta\\|fa\\|exp\\|ace\\|gb\\)\\'" . dna-mode)))
235 ;; Setup hooks on request when this mode is loaded.
236 (if dna-setup-on-load
239 (defun dna-next-char-func ()
240 "Should never be called. Overridden in `dna-forward-base'."
241 (error "This shouldnt have been called"))
244 (defun dna-forward-base (count)
245 "Move forward COUNT bases. Move backward if negative.
246 Skip over dna-isearch-cruft. Stop on non-base or
247 non-whitespace characters."
250 (abscount (abs count))
251 (dir (if (< count 0) -1 1))
255 (fset 'dna-next-char-func (if (< dir 0) 'preceding-char 'following-char))
257 (while (< c abscount)
258 (setq bstr (char-to-string (dna-next-char-func)))
260 ((string-match dna-valid-base-regexp bstr)
263 ((string-match dna-cruft-regexp bstr)
266 (message "Moved %d bases forward." c)
267 (setq abscount c)))) ; stop the while
269 ;; Move over trailing junk when moving forward
271 (while (string-match dna-cruft-regexp
272 (char-to-string (dna-next-char-func)))
275 ;; return the distance moved
278 ;; aaaaaaaaaa cccccccccc | gggggggggg tttttttttt
280 (defun dna-backward-base (count)
281 "Move backward COUNT bases. See `dna-forward-base'."
283 (dna-forward-base (- count)))
285 (defun dna-beginning-of-sequence ()
286 "Move the start of the sequence or the buffer."
290 (search-backward-regexp dna-sequence-start-regexp (point-min) t)
293 (defun dna-end-of-sequence ()
294 "Move to the end of the sequence or the buffer."
297 (skip-syntax-forward "-")
299 (search-forward-regexp dna-sequence-start-regexp (point-max) t)))
303 (goto-char (point-max)))))
305 (defun dna-mark-sequence ()
306 "Put point at the beginning of a sequence, mark at end."
308 (dna-end-of-sequence)
310 (dna-beginning-of-sequence))
312 (defun dna-count-bases-region (d-start d-end)
313 "Count the number of bases in the region D-START to D-END.
314 Echos the number of bases counted.
315 If an invalid base is found, stops on the base and signals an error."
319 (while (< (point) d-end)
321 ((looking-at dna-valid-base-regexp)
322 (setq basecount (1+ basecount))
324 ((looking-at dna-cruft-regexp)
327 (error "Bad base found. '%s'"
328 (buffer-substring (point) (1+ (point)))))
330 (message "There are %d bases in the region." basecount)
333 ;;; reverse and complement
334 (defun dna-complement-base-list (base)
335 "Complement the BASE using a list based method.
336 Returns the complement of the base.
337 It can also be used to test if the character is a base,
338 as all bases should have a complement."
339 (cdr (assq base dna-base-complement-list)))
341 (defun dna-complement-base (base)
342 "Complement a BASE using a vector based method.
343 See `dna-complement-base-list' for more info."
344 (aref dna-base-complement-vector base))
346 (defun dna-complement (base)
347 "Look up the complement of the BASE and print a message.
348 Handy for us CS types."
349 (interactive "cComplement of base:")
350 (message "Complement of '%c' is '%c'." base (dna-complement-base base)))
352 (defun dna-complement-region (r-start r-end)
353 "Complement a region of bases from R-START to R-END.
354 Complement a region of the buffer by deleting it and
355 inserting the complements, base by base. Non-bases are
356 passed over unchanged."
358 (let (r-string r-length r-point r-base r-cbase)
360 (setq r-string (buffer-substring-no-properties r-start r-end))
361 (setq r-length (length r-string))
362 (delete-region r-start r-end)
364 (while (< r-point r-length)
365 (setq r-base (aref r-string r-point))
366 (setq r-cbase (dna-complement-base r-base))
367 (insert (if r-cbase r-cbase r-base))
368 (setq r-point (1+ r-point)))))
371 (defun dna-reverse-complement-region (r-start r-end)
372 "Reverse complement a region of dna from R-START to R-END.
373 Works by deleting the region and inserting bases reversed
374 and complemented, while entering non-bases in the order
377 (let (r-string r-length r-base r-cbase r-point r-mark)
379 (setq r-string (buffer-substring-no-properties r-start r-end))
380 (setq r-length (length r-string))
381 (setq r-mark (1- r-length))
385 (delete-region r-start r-end)
387 ;; insert the bases from back to front base by base
388 ;; insert non-bases from front to back to preserve spacing
389 (while (< r-point r-length)
390 (setq r-base (aref r-string r-point))
391 (setq r-cbase (dna-complement-base r-base))
394 ;; it is a base. find the reverse and complement it
395 (while (not (dna-complement-base (aref r-string r-mark)))
396 (setq r-mark (1- r-mark)))
397 (insert (dna-complement-base (aref r-string r-mark)))
398 (setq r-mark (1- r-mark)) )
399 ;; not a base, no change
401 (setq r-point (1+ r-point)))))
404 (defun dna-guess-format-func ()
405 "Guess the format of the sequence the point is at or after.
406 Returns the format or nil."
409 (dna-beginning-of-sequence)
411 ((looking-at "^>") 'fasta)
412 ((looking-at "^DNA") 'phrap)
413 ((looking-at "^ID") 'exp)
416 (defun dna-guess-format ()
417 "Guess and print the format of the sequence."
419 (message "%s" (dna-guess-format-func)))
421 ;;; dna-isearch stuff
422 (defun dna-isearch-mangle-str (str)
423 "Mangle the string STR into a regexp to search over cruft in sequence.
424 Inserts a regexp between each base which matches sequence formatting cruft.
425 For example, if `dna-cruft-regexp' is '[ ]',
426 the search string 'acgt' would transformed into 'a[ ]*c[ ]*g[ ]*t[ ]*'"
427 (let ((i 0) (out ""))
428 (while (< i (length str))
429 (setq out (concat out (substring str i (1+ i)) dna-cruft-regexp "*"))
433 (defadvice isearch-message-prefix (around dna-isearch-ismp)
434 "Set the isearch prompt string to show dna search is active.
435 This serves as a warning that the string is being mangled."
437 (setq ad-return-value (concat "DNA " ad-return-value)))
439 (defadvice isearch-search (around dna-isearch-iss)
440 "The advice used to mangle the search string in isearch."
441 (let ((isearch-regexp t)
442 ;; force case folding
443 (isearch-case-fold-search dna-isearch-case-fold-search)
444 (isearch-string (dna-isearch-mangle-str isearch-string)) )
448 (defun dna-isearch-forward ()
449 "Isearch forward on dna sequence.
450 Enable the `dna-mode' search string mangling advice and start the search."
453 (ad-enable-advice 'isearch-message-prefix 'around 'dna-isearch-ismp)
454 (ad-activate 'isearch-message-prefix)
455 ;; Enable the mangling
456 (ad-enable-advice 'isearch-search 'around 'dna-isearch-iss)
457 (ad-activate 'isearch-search)
463 (ad-disable-advice 'isearch-message-prefix 'around 'dna-isearch-ismp)
464 (ad-activate 'isearch-message-prefix)
466 (ad-disable-advice 'isearch-search 'around 'dna-isearch-iss)
467 (ad-activate 'isearch-search))
469 ;;; Work with columns of sequences.
471 (defun dna-column-select-func ()
472 "Return the start and end of the column as a cons.
473 Point is moved forward one."
477 (while (looking-at dna-valid-base-regexp)
482 (while (looking-at dna-valid-base-regexp)
486 ;; return the start and end of the column
489 (defun dna-column-select ()
490 "Select the current column of text.
491 Sets the mark at the top and the point at the bottom of a non-blank column."
493 (let ((se (dna-column-select-func)))
496 (goto-char (cdr se))))
498 (defvar dna-column-pad "*"
499 "Character to use when inserting a column of pads.")
501 (defun dna-column-insert-pad ()
502 "Insert a column of pads."
505 (let ((se (dna-column-select-func)))
506 (string-rectangle (car se) (cdr se) dna-column-pad))))
508 (defun dna-column-delete ()
509 "Delete the current column of dna."
512 (let ((se (dna-column-select-func)))
513 (kill-rectangle (car se) (cdr se)))))
517 (defun dna-base-color-make-faces (&optional force)
518 "Build a face to display bases with. FORCE remakes the faces."
519 (when (or (not (facep 'dna-face-t)) force)
520 (let ((base-list '("a" "c" "g" "t"))
523 (setq base (car base-list))
524 (setq base-face (intern (concat "dna-base-face-" base)))
525 (make-face base-face)
527 base-face (symbol-value (intern (concat "dna-base-color-" base))))
528 (setq base-list (cdr base-list))))))
530 ;; Make faces on load
531 (dna-base-color-make-faces t)
533 (defvar dna-color-bases-auto t
534 "Automaticly deactivate `font-lock-mode' when `dna-color-bases' is run.
535 See dna-color-bases for details.")
536 ;; (setq dna-color-bases-auto t)
538 (defun dna-color-bases-region (s e)
539 "Color the bases in the region S to E.
540 NOTE: The function `font-lock-mode' will undo the work of this
541 function if activated. Disable it before using this
542 function. If `dna-color-bases-auto' is set then `font-lock-mode'
543 is deactivated automatically."
545 (if (and dna-color-bases-auto font-lock-mode)
548 (error "Font-lock-mode is on -- deactivate it"))
553 (setq c (downcase (char-after s)))
556 (set-text-properties s (+ s 1) '(face dna-base-face-a)))
558 (set-text-properties s (+ s 1) '(face dna-base-face-c)))
560 (set-text-properties s (+ s 1) '(face dna-base-face-g)))
562 (set-text-properties s (+ s 1) '(face dna-base-face-t)))
566 (defun dna-uncolor-bases-region (s e)
567 "Uncolor the bases from S to E."
569 (remove-text-properties s e '(face nil)))
571 ;;; Functions for me.
573 ;; I like to datestamp sequences I work with.
574 (defvar dna-timestamp-format "%Y%m%d"
575 "Format of the time stamp which `dna-timestamp-seq' uses.")
577 (defun dna-timestamp-seq ()
578 "Insert the current date into the sequence.
579 Assumes fasta format."
582 (dna-beginning-of-sequence)
584 (insert " " (format-time-string dna-timestamp-format (current-time))))
587 (run-hooks 'dna-mode-load-hook)
590 ;;; dna-mode.el ends here