// Package shlex provides simple lexical analysis in the style of the
// Unix shell.
package shlex
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"errors"
|
||
|
"io"
|
||
|
"strings"
|
||
|
"unicode"
|
||
|
)
|
||
|
|
||
|
// Sentinel errors reported when the input ends in the middle of a
// construct that still needs more characters.
var (
	// ErrNoClosing is returned when the input ends inside a quotation,
	// before the matching closing quote has been read.
	ErrNoClosing = errors.New("No closing quotation")

	// ErrNoEscaped is returned when the input ends immediately after an
	// escape character, so there is nothing left to escape.
	ErrNoEscaped = errors.New("No escaped character")
)
|
||
|
|
||
|
// Tokenizer is the interface that classifies a rune as a word character,
// whitespace, a quote, an escape or an escaped quote. A Lexer consults it
// both for the runes it reads and for its own internal scanner state, so
// an implementation decides how the input is divided into tokens.
type Tokenizer interface {
	// IsWord reports whether r may appear in an unquoted word.
	IsWord(r rune) bool

	// IsWhitespace reports whether r separates tokens.
	IsWhitespace(r rune) bool

	// IsQuote reports whether r opens a quotation; the quotation is
	// closed by the next occurrence of the same rune.
	IsQuote(r rune) bool

	// IsEscape reports whether r is an escape character. Escapes are
	// only interpreted by a Lexer in posix mode.
	IsEscape(r rune) bool

	// IsEscapedQuote reports whether r is a quote inside whose
	// quotation escape characters are still interpreted (posix mode).
	IsEscapedQuote(r rune) bool
}
|
||
|
|
||
|
// DefaultTokenizer implements a simple tokenizer like Unix shell. It
// carries no state, so its zero value is ready to use.
type DefaultTokenizer struct{}

// IsWord reports whether r is an underscore, a Unicode letter or a
// Unicode number.
func (t *DefaultTokenizer) IsWord(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsNumber(r)
}

// IsQuote reports whether r is a single or a double quote.
func (t *DefaultTokenizer) IsQuote(r rune) bool {
	return r == '\'' || r == '"'
}

// IsWhitespace reports whether r is Unicode white space.
func (t *DefaultTokenizer) IsWhitespace(r rune) bool {
	return unicode.IsSpace(r)
}

// IsEscape reports whether r is a backslash.
func (t *DefaultTokenizer) IsEscape(r rune) bool {
	return r == '\\'
}

// IsEscapedQuote reports whether r is a double quote, the only quote
// inside which escape characters are interpreted.
func (t *DefaultTokenizer) IsEscapedQuote(r rune) bool {
	return r == '"'
}
|
||
|
|
||
|
// Lexer represents a lexical analyzer.
|
||
|
type Lexer struct {
|
||
|
reader *bufio.Reader
|
||
|
tokenizer Tokenizer
|
||
|
posix bool
|
||
|
whitespacesplit bool
|
||
|
}
|
||
|
|
||
|
// NewLexer creates a new Lexer reading from io.Reader. This Lexer
|
||
|
// has a DefaultTokenizer according to posix and whitespacesplit
|
||
|
// rules.
|
||
|
func NewLexer(r io.Reader, posix, whitespacesplit bool) *Lexer {
|
||
|
return &Lexer{
|
||
|
reader: bufio.NewReader(r),
|
||
|
tokenizer: &DefaultTokenizer{},
|
||
|
posix: posix,
|
||
|
whitespacesplit: whitespacesplit,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// NewLexerString creates a new Lexer reading from a string. This
|
||
|
// Lexer has a DefaultTokenizer according to posix and whitespacesplit
|
||
|
// rules.
|
||
|
func NewLexerString(s string, posix, whitespacesplit bool) *Lexer {
|
||
|
return NewLexer(strings.NewReader(s), posix, whitespacesplit)
|
||
|
}
|
||
|
|
||
|
// Split splits a string according to posix or non-posix rules.
|
||
|
func Split(s string, posix bool) ([]string, error) {
|
||
|
return NewLexerString(s, posix, true).Split()
|
||
|
}
|
||
|
|
||
|
// SetTokenizer sets a Tokenizer.
|
||
|
func (l *Lexer) SetTokenizer(t Tokenizer) {
|
||
|
l.tokenizer = t
|
||
|
}
|
||
|
|
||
|
func (l *Lexer) Split() ([]string, error) {
|
||
|
result := make([]string, 0)
|
||
|
for {
|
||
|
token, err := l.readToken()
|
||
|
if token != "" {
|
||
|
result = append(result, token)
|
||
|
}
|
||
|
|
||
|
if err == io.EOF {
|
||
|
break
|
||
|
} else if err != nil {
|
||
|
return result, err
|
||
|
}
|
||
|
}
|
||
|
return result, nil
|
||
|
}
|
||
|
|
||
|
func (l *Lexer) readToken() (string, error) {
|
||
|
t := l.tokenizer
|
||
|
token := ""
|
||
|
quoted := false
|
||
|
state := ' '
|
||
|
escapedstate := ' '
|
||
|
scanning:
|
||
|
for {
|
||
|
next, _, err := l.reader.ReadRune()
|
||
|
if err != nil {
|
||
|
if t.IsQuote(state) {
|
||
|
return token, ErrNoClosing
|
||
|
} else if t.IsEscape(state) {
|
||
|
return token, ErrNoEscaped
|
||
|
}
|
||
|
return token, err
|
||
|
}
|
||
|
|
||
|
switch {
|
||
|
case t.IsWhitespace(state):
|
||
|
switch {
|
||
|
case t.IsWhitespace(next):
|
||
|
break scanning
|
||
|
case l.posix && t.IsEscape(next):
|
||
|
escapedstate = 'a'
|
||
|
state = next
|
||
|
case t.IsWord(next):
|
||
|
token += string(next)
|
||
|
state = 'a'
|
||
|
case t.IsQuote(next):
|
||
|
if !l.posix {
|
||
|
token += string(next)
|
||
|
}
|
||
|
state = next
|
||
|
default:
|
||
|
token = string(next)
|
||
|
if l.whitespacesplit {
|
||
|
state = 'a'
|
||
|
} else if token != "" || (l.posix && quoted) {
|
||
|
break scanning
|
||
|
}
|
||
|
}
|
||
|
case t.IsQuote(state):
|
||
|
quoted = true
|
||
|
switch {
|
||
|
case next == state:
|
||
|
if !l.posix {
|
||
|
token += string(next)
|
||
|
break scanning
|
||
|
} else {
|
||
|
state = 'a'
|
||
|
}
|
||
|
case l.posix && t.IsEscape(next) && t.IsEscapedQuote(state):
|
||
|
escapedstate = state
|
||
|
state = next
|
||
|
default:
|
||
|
token += string(next)
|
||
|
}
|
||
|
case t.IsEscape(state):
|
||
|
if t.IsQuote(escapedstate) && next != state && next != escapedstate {
|
||
|
token += string(state)
|
||
|
}
|
||
|
token += string(next)
|
||
|
state = escapedstate
|
||
|
case t.IsWord(state):
|
||
|
switch {
|
||
|
case t.IsWhitespace(next):
|
||
|
if token != "" || (l.posix && quoted) {
|
||
|
break scanning
|
||
|
}
|
||
|
case l.posix && t.IsQuote(next):
|
||
|
state = next
|
||
|
case l.posix && t.IsEscape(next):
|
||
|
escapedstate = 'a'
|
||
|
state = next
|
||
|
case t.IsWord(next) || t.IsQuote(next):
|
||
|
token += string(next)
|
||
|
default:
|
||
|
if l.whitespacesplit {
|
||
|
token += string(next)
|
||
|
} else if token != "" {
|
||
|
l.reader.UnreadRune()
|
||
|
break scanning
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return token, nil
|
||
|
}
|