diff --git a/kittens/ssh/utils_test.go b/kittens/ssh/utils_test.go index 2d9990ca3..e76c181f1 100644 --- a/kittens/ssh/utils_test.go +++ b/kittens/ssh/utils_test.go @@ -28,6 +28,9 @@ func TestParseSSHArgs(t *testing.T) { if err != nil { t.Fatal(err) } + if len(ans) == 0 { + ans = []string{} + } return ans } @@ -39,7 +42,7 @@ func TestParseSSHArgs(t *testing.T) { check := func(a, b any) { diff := cmp.Diff(a, b) if diff != "" { - t.Fatalf("Unexpected value for args: %s\n%s", args, diff) + t.Fatalf("Unexpected value for args: %#v\n%s", args, diff) } } check(split(expected_ssh_args), ssh_args) diff --git a/kitty/shlex.c b/kitty/shlex.c index c4facff65..fe159a21f 100644 --- a/kitty/shlex.c +++ b/kitty/shlex.c @@ -109,7 +109,6 @@ next_word(Shlex *self, PyObject *args UNUSED) { switch(ch) { case STRING_WITHOUT_ESCAPES_DELIM: set_state(self, WORD); - if (self->buf_pos && self->state == NORMAL) return get_word(self); break; default: write_ch(self, ch); break; } break; @@ -117,13 +116,9 @@ next_word(Shlex *self, PyObject *args UNUSED) { switch(ch) { case STRING_WITH_ESCAPES_DELIM: set_state(self, WORD); - if (self->buf_pos && self->state == NORMAL) return get_word(self); break; case ESCAPE_CHAR: - if (self->src_pos < self->src_sz) { - Py_UCS4 nch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); self->src_pos++; - write_ch(self, nch); - } + write_escape_ch(self); break; default: write_ch(self, ch); break; } break; diff --git a/tools/utils/shlex/shlex.go b/tools/utils/shlex/shlex.go index 2e0d1a106..1aaf85fa3 100644 --- a/tools/utils/shlex/shlex.go +++ b/tools/utils/shlex/shlex.go @@ -12,419 +12,204 @@ To process a stream of strings: for ; token, err := l.Next(); err != nil { // process token } - -To access the raw token stream (which includes tokens for spaces): - - t := NewTokenizer(os.Stdin) - for ; token, err := t.Next(); err != nil { - // process token - } */ package shlex -// Based on https://pkg.go.dev/github.com/google/shlex with many improvements -// Relicensed to GPLv3 since all my additions.changes are GPLv3 which makes the -// original work with was APL2 also GPLv3 - import ( - "errors" "fmt" - "io" "strings" + "unicode/utf8" ) -// TokenType is a top-level token classification: A word, space, unknown. -type TokenType int - -// runeTokenClass is the type of a UTF-8 character classification: A quote, space, escape. -type runeTokenClass int - -// the internal state used by the lexer state machine -type lexerState int - -// Token is a (type, value) pair representing a lexographical token. -type Token struct { - Type TokenType - Value string - Pos int64 +type Word struct { + Value string // The word is empty if EOF is reached + Pos int // The position in the input string of the word or the trailer + Err error // Indicates an error (unterminated string or trailing unescaped backslash) + Trailer string // Extra trailing data such as an unterminated string or an unescaped backslash. Present only if Err != nil } -// Named classes of UTF-8 runes -const ( - spaceRunes = " \t\r\n" - escapingQuoteRunes = `"` - nonEscapingQuoteRunes = "'" - escapeRunes = `\` -) - -// Classes of rune token -const ( - unknownRuneClass runeTokenClass = iota - spaceRuneClass - escapingQuoteRuneClass - nonEscapingQuoteRuneClass - escapeRuneClass - eofRuneClass -) - -// Classes of lexographic token -const ( - UnknownToken TokenType = iota - WordToken - SpaceToken -) - -func (t TokenType) String() string { - switch t { - default: - return "UnknownToken" - case WordToken: - return "WordToken" - case SpaceToken: - return "SpaceToken" - } -} +type lexer_state int // Lexer state machine states const ( - startState lexerState = iota // no runes have been seen - inWordState // processing regular runes in a word - inSpaceState // processing runes in a space - escapingState // we have just consumed an escape rune; the next rune is literal - escapingQuotedState // we have just consumed an escape rune within a quoted string - quotingEscapingState // we are within a quoted string that supports escaping ("...") - quotingState // we are within a string that does not support escaping ('...') + lex_normal lexer_state = iota + word + string_without_escapes + string_with_escapes ) -// tokenClassifier is used for classifying rune characters. -type tokenClassifier map[rune]runeTokenClass - -func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) { - for _, runeChar := range runes { - typeMap[runeChar] = tokenType - } -} - -// newDefaultClassifier creates a new classifier for ASCII characters. -func newDefaultClassifier() tokenClassifier { - t := tokenClassifier{} - t.addRuneClass(spaceRunes, spaceRuneClass) - t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass) - t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass) - t.addRuneClass(escapeRunes, escapeRuneClass) - return t -} - -// ClassifyRune classifiees a rune -func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass { - return t[runeVal] -} - // Lexer turns an input stream into a sequence of tokens. Whitespace is skipped. -type Lexer Tokenizer - -// NewLexer creates a new lexer from an input stream. -func NewLexer(x io.RuneReader) *Lexer { - - return (*Lexer)(NewTokenizer(x)) +type Lexer struct { + state lexer_state + src string + src_sz, src_pos, word_start int + buf strings.Builder } -// Next returns the next word, or an error. If there are no more words, -// the error will be io.EOF. -func (l *Lexer) Next() (string, error) { - for { - token, err := (*Tokenizer)(l).Next() - if err != nil { - return "", err - } - switch token.Type { - case WordToken: - return token.Value, nil - case SpaceToken: - // skip spaces - default: - return "", fmt.Errorf("Unknown token type: %s", token.Type) - } - } +// NewLexer creates a new lexer from an input string. +func NewLexer(x string) *Lexer { + return &Lexer{src: x, src_sz: len(x)} } -// Tokenizer turns an input stream into a sequence of typed tokens -type Tokenizer struct { - input io.RuneReader - classifier tokenClassifier - pos int64 - redo_rune struct { - char rune - sz int - rune_type runeTokenClass - } +func (self *Lexer) start_word() { + self.buf.Reset() + self.word_start = self.src_pos - 1 } -// NewTokenizer creates a new tokenizer from an input stream. -func NewTokenizer(input io.RuneReader) *Tokenizer { - classifier := newDefaultClassifier() - return &Tokenizer{ - input: input, - classifier: classifier} +func (self *Lexer) get_word() Word { + return Word{Pos: self.word_start, Value: self.buf.String()} } -var ErrTrailingEscape error = errors.New("EOF found after escape character") -var ErrTrailingQuoteEscape error = errors.New("EOF found after escape character for double quote") -var ErrUnclosedDoubleQuote error = errors.New("EOF found when expecting closing double quote") -var ErrUnclosedSingleQuote error = errors.New("EOF found when expecting closing single quote") +func (self *Lexer) write_ch(ch byte) { + self.buf.WriteByte(ch) +} -// scanStream scans the stream for the next token using the internal state machine. -// It will panic if it encounters a rune which it does not know how to handle. -func (t *Tokenizer) scanStream() (*Token, error) { - state := startState - var tokenType TokenType - var nextRune rune - var nextRuneType runeTokenClass - var err error - var sz int - value := strings.Builder{} - pos_at_start := t.pos - - unread_rune := func() { - t.redo_rune.sz = sz - t.redo_rune.char = nextRune - t.redo_rune.rune_type = nextRuneType - t.pos -= int64(sz) - } - - token := func() *Token { - return &Token{tokenType, value.String(), pos_at_start} - } - - for { - if t.redo_rune.sz > 0 { - nextRune, sz = t.redo_rune.char, t.redo_rune.sz - nextRuneType = t.redo_rune.rune_type - t.redo_rune.sz = 0 - } else { - nextRune, sz, err = t.input.ReadRune() - nextRuneType = t.classifier.ClassifyRune(nextRune) +func (self *Lexer) write_escaped_ch() bool { + ch, count := utf8.DecodeRuneInString(self.src[self.src_pos:]) + if count > 0 { + self.src_pos += count + if ch != utf8.RuneError { + self.buf.WriteRune(ch) } + return true + } + return false +} - if err == io.EOF { - nextRuneType = eofRuneClass - err = nil - } else if err != nil { - return nil, err - } - t.pos += int64(sz) - - switch state { - case startState: // no runes read yet - { - switch nextRuneType { - case eofRuneClass: - { - return nil, io.EOF - } - case spaceRuneClass: - { - tokenType = SpaceToken - value.WriteRune(nextRune) - state = inSpaceState - } - case escapingQuoteRuneClass: - { - tokenType = WordToken - state = quotingEscapingState - } - case nonEscapingQuoteRuneClass: - { - tokenType = WordToken - state = quotingState - } - case escapeRuneClass: - { - tokenType = WordToken - state = escapingState - } - default: - { - tokenType = WordToken - value.WriteRune(nextRune) - state = inWordState - } +// Next returns the next word. At EOF Word.Value will be "" +func (self *Lexer) Next() (ans Word) { + const string_with_escapes_delim = '"' + const string_without_escapes_delim = '\'' + const escape_char = '\\' + for self.src_pos < self.src_sz { + ch := self.src[self.src_pos] + self.src_pos++ + switch self.state { + case lex_normal: + switch ch { + case ' ', '\n', '\r', '\t': + case string_with_escapes_delim: + self.state = string_with_escapes + self.start_word() + case string_without_escapes_delim: + self.state = string_without_escapes + self.start_word() + case escape_char: + self.start_word() + if !self.write_escaped_ch() { + ans.Trailer = "\\" + ans.Err = fmt.Errorf("Extra backslash at end of input") + ans.Pos = self.word_start + return } + self.state = word + default: + self.state = word + self.start_word() + self.write_ch(ch) } - case inSpaceState: // in a sequence of spaces separating words - { - switch nextRuneType { - case spaceRuneClass: - { - value.WriteRune(nextRune) - } - default: - { - unread_rune() - return token(), err - } + case word: + switch ch { + case ' ', '\n', '\r', '\t': + self.state = lex_normal + if self.buf.Len() > 0 { + return self.get_word() } - } - case inWordState: // in a regular word - { - switch nextRuneType { - case eofRuneClass: - { - return token(), err - } - case spaceRuneClass: - { - unread_rune() - return token(), err - } - case escapingQuoteRuneClass: - { - state = quotingEscapingState - } - case nonEscapingQuoteRuneClass: - { - state = quotingState - } - case escapeRuneClass: - { - state = escapingState - } - default: - { - value.WriteRune(nextRune) - } + case string_with_escapes_delim: + self.state = string_with_escapes + case string_without_escapes_delim: + self.state = string_without_escapes + case escape_char: + if !self.write_escaped_ch() { + ans.Pos = self.word_start + ans.Trailer = self.buf.String() + "\\" + ans.Err = fmt.Errorf("Extra backslash at end of input") + return } + default: + self.write_ch(ch) } - case escapingState: // the rune after an escape character - { - switch nextRuneType { - case eofRuneClass: - { - err = ErrTrailingEscape - return token(), err - } - default: - { - state = inWordState - value.WriteRune(nextRune) - } - } + case string_without_escapes: + switch ch { + case string_without_escapes_delim: + self.state = word + default: + self.write_ch(ch) } - case escapingQuotedState: // the next rune after an escape character, in double quotes - { - switch nextRuneType { - case eofRuneClass: - { - err = ErrTrailingQuoteEscape - return token(), err - } - default: - { - state = quotingEscapingState - value.WriteRune(nextRune) - } - } - } - case quotingEscapingState: // in escaping double quotes - { - switch nextRuneType { - case eofRuneClass: - { - err = ErrUnclosedDoubleQuote - return token(), err - } - case escapingQuoteRuneClass: - { - state = inWordState - } - case escapeRuneClass: - { - state = escapingQuotedState - } - default: - { - value.WriteRune(nextRune) - } - } - } - case quotingState: // in non-escaping single quotes - { - switch nextRuneType { - case eofRuneClass: - { - err = ErrUnclosedSingleQuote - return token(), err - } - case nonEscapingQuoteRuneClass: - { - state = inWordState - } - default: - { - value.WriteRune(nextRune) - } - } - } - default: - { - return nil, fmt.Errorf("Unexpected state: %v", state) + case string_with_escapes: + switch ch { + case string_with_escapes_delim: + self.state = word + case escape_char: + self.write_escaped_ch() + default: + self.write_ch(ch) } } } -} + switch self.state { + case word: + self.state = lex_normal + if self.buf.Len() > 0 { + return self.get_word() + } + case string_with_escapes, string_without_escapes: + self.state = lex_normal + ans.Trailer = self.buf.String() + ans.Pos = self.word_start + ans.Err = fmt.Errorf("Unterminated string at end of input") + return + case lex_normal: -// Next returns the next token in the stream. -func (t *Tokenizer) Next() (*Token, error) { - return t.scanStream() -} - -// Pos returns the current position in the string as a byte offset -func (t *Tokenizer) Pos() int64 { - return t.pos + } + return } // Split partitions a string into a slice of strings. -func Split(s string) ([]string, error) { - l := NewLexer(strings.NewReader(s)) - subStrings := make([]string, 0) +func Split(s string) (ans []string, err error) { + l := NewLexer(s) + var word Word for { - word, err := l.Next() - if err != nil { - if err == io.EOF { - return subStrings, nil - } - return subStrings, err + word = l.Next() + if word.Err != nil { + return ans, word.Err } - subStrings = append(subStrings, word) + if word.Value == "" { + break + } + ans = append(ans, word.Value) } + return } // SplitForCompletion partitions a string into a slice of strings. It differs from Split in being -// more relaxed about errors and also adding an empty string at the end if s ends with a SpaceToken. +// more relaxed about errors and also adding an empty string at the end if s ends with a Space. func SplitForCompletion(s string) (argv []string, position_of_last_arg int) { - t := NewTokenizer(strings.NewReader(s)) + t := NewLexer(s) argv = make([]string, 0, len(s)/4) - token := &Token{} for { - ntoken, err := t.Next() - if err == io.EOF { - if token.Type == SpaceToken { - argv = append(argv, "") - token.Pos += int64(len(token.Value)) + word := t.Next() + if word.Value == "" { + if word.Trailer == "" { + trimmed := strings.TrimRight(s, " ") + if len(trimmed) < len(s) { // trailing spaces + pos := position_of_last_arg + if len(argv) > 0 { + pos += len(argv[len(argv)-1]) + } + if pos < len(s) { // trailing whitespace + argv = append(argv, "") + position_of_last_arg += len(s) - pos + 1 + } + } + } else { + argv = append(argv, word.Trailer) + position_of_last_arg = word.Pos } - return argv, int(token.Pos) + break } - if ntoken == nil { - return []string{}, -1 - } - switch ntoken.Type { - case WordToken: - argv = append(argv, ntoken.Value) - case SpaceToken: - // skip spaces - default: - return []string{}, -1 - } - token = ntoken + position_of_last_arg = word.Pos + argv = append(argv, word.Value) } + return } diff --git a/tools/utils/shlex/shlex_test.go b/tools/utils/shlex/shlex_test.go index db5d348f8..f6be690eb 100644 --- a/tools/utils/shlex/shlex_test.go +++ b/tools/utils/shlex/shlex_test.go @@ -1,7 +1,6 @@ package shlex import ( - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -13,78 +12,24 @@ var ( testString = "one two \"three four\" \"five \\\"six\\\"\" seven#eight # nine # ten eleven 'twelve\\' thirteen=13 fourteen/14" ) -func TestClassifier(t *testing.T) { - classifier := newDefaultClassifier() - tests := map[rune]runeTokenClass{ - ' ': spaceRuneClass, - '"': escapingQuoteRuneClass, - '\'': nonEscapingQuoteRuneClass} - for runeChar, want := range tests { - got := classifier.ClassifyRune(runeChar) - if got != want { - t.Errorf("ClassifyRune(%v) -> %v. Want: %v", runeChar, got, want) - } - } -} - -func TestTokenizer(t *testing.T) { - testInput := testString - expectedTokens := []*Token{ - {WordToken, "one", 0}, - {SpaceToken, " ", 3}, - {WordToken, "two", 4}, - {SpaceToken, " ", 7}, - {WordToken, "three four", 8}, - {SpaceToken, " ", 20}, - {WordToken, "five \"six\"", 21}, - {SpaceToken, " ", 35}, - {WordToken, "seven#eight", 36}, - {SpaceToken, " ", 47}, - {WordToken, "#", 48}, - {SpaceToken, " ", 49}, - {WordToken, "nine", 50}, - {SpaceToken, " ", 54}, - {WordToken, "#", 55}, - {SpaceToken, " ", 56}, - {WordToken, "ten", 57}, - {SpaceToken, " ", 60}, - {WordToken, "eleven", 61}, - {SpaceToken, " ", 67}, - {WordToken, "twelve\\", 68}, - {SpaceToken, " ", 77}, - {WordToken, "thirteen=13", 78}, - {SpaceToken, " ", 89}, - {WordToken, "fourteen/14", 90}, - } - - tokenizer := NewTokenizer(strings.NewReader(testInput)) - for i, want := range expectedTokens { - got, err := tokenizer.Next() - if err != nil { - t.Error(err) - } - if diff := cmp.Diff(want, got); diff != "" { - t.Fatalf("Tokenizer.Next()[%v] of: %s:\n%s", i, testString, diff) - } - } -} - func TestLexer(t *testing.T) { testInput := testString expectedStrings := []string{"one", "two", "three four", "five \"six\"", "seven#eight", "#", "nine", "#", "ten", "eleven", "twelve\\", "thirteen=13", "fourteen/14"} - lexer := NewLexer(strings.NewReader(testInput)) + lexer := NewLexer(testInput) for i, want := range expectedStrings { - got, err := lexer.Next() - if err != nil { - t.Error(err) - } - if got != want { + got := lexer.Next() + if got.Value != want { t.Errorf("Lexer.Next()[%v] of %q -> %v. Want: %v", i, testString, got, want) } } } +type Tok struct { + Pos int + Val string +} + func TestSplit(t *testing.T) { want := []string{"one", "two", "three four", "five \"six\"", "seven#eight", "#", "nine", "#", "ten", "eleven", "twelve\\", "thirteen=13", "fourteen/14"} got, err := Split(testString) @@ -99,6 +44,43 @@ func TestSplit(t *testing.T) { t.Errorf("Split(%q)[%v] -> %v. Want: %v", testString, i, got[i], want[i]) } } + + for _, x := range []string{ + `abc\`, `\`, `'abc`, `'`, `"`, `asd\`, + } { + _, err := Split(x) + if err == nil { + t.Fatalf("Failed to get an error for: %#v", x) + } + } + s := func(q string) (ans []Tok) { + l := NewLexer(q) + for { + w := l.Next() + if w.Err != nil { + t.Fatal(w.Err) + } + if w.Value == "" { + break + } + ans = append(ans, Tok{w.Pos, w.Value}) + } + return + } + for q, expected := range map[string][]Tok{ + `"ab"`: {{0, "ab"}}, + `x "ab"y \m`: {{0, `x`}, {2, `aby`}, {8, `m`}}, + `x'y"\z'1`: {{0, `xy"\z1`}}, + `\abc\ d`: {{0, `abc d`}}, + ``: nil, + ` `: nil, + " \tabc\n\t\r ": {{2, "abc"}}, + } { + if diff := cmp.Diff(expected, s(q)); diff != "" { + t.Fatalf("Failed for string: %#v\n%s", q, diff) + } + } + } func TestSplitForCompletion(t *testing.T) { @@ -108,7 +90,7 @@ func TestSplitForCompletion(t *testing.T) { t.Fatalf("Failed to split: %s\n%s", cmdline, diff) } if last_arg_pos != actual_pos { - t.Fatalf("Failed to split: %s\n Last arg pos: %d != %d", cmdline, last_arg_pos, actual_pos) + t.Fatalf("Failed to split: %#v\n Last arg pos: %d != %d", cmdline, last_arg_pos, actual_pos) } } test("a b", 2, "a", "b")