Update documentation to match the current code
readme.md:
- Update examples, fixing some arguments, and stop using deprecated
  methods.
- Add blank lines between successive code blocks, as this is usually
  what markdown parsers expect.
- Fix a typo in a method name.
- Explain that numbers and underscores are not part of keywords by
  default.
- Remove trailing whitespace at the end of some lines.

tokenizer.go
- Add blank lines before "Deprecated:" doc comments so that go tools
  can mark them as deprecated in the docs and in editors (see the
  sketch below).
- Fix the description of DefineStringToken().
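
As context for the "Deprecated:" change (an illustrative sketch, not code from this repository): go tools such as gopls and pkg.go.dev only flag a symbol as deprecated when the "Deprecated:" note begins its own paragraph of the doc comment, which requires a blank `//` line before it:

```go
// OldName does something useful.
//
// Deprecated: use NewName instead. The blank "//" line above starts a
// new paragraph, which is what go tools look for when marking a
// symbol as deprecated.
func OldName() {}

// NewName replaces OldName.
func NewName() {}
```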
n-peugnet authored and bzick committed Jan 15, 2025
1 parent c98a149 commit adf0ff4
Showing 2 changed files with 41 additions and 26 deletions.
59 changes: 35 additions & 24 deletions readme.md
@@ -32,29 +32,33 @@ Use cases:
For example, parsing SQL `WHERE` condition `user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`:

```go
+import "github.com/bzick/tokenizer"
+
// define custom tokens keys
-const (
-    TEquality = 1
-    TDot      = 2
-    TMath     = 3
+const (
+    TEquality = iota + 1
+    TDot
+    TMath
+    TDoubleQuoted
)

// configure tokenizer
parser := tokenizer.New()
parser.DefineTokens(TEquality, []string{"<", "<=", "==", ">=", ">", "!="})
parser.DefineTokens(TDot, []string{"."})
parser.DefineTokens(TMath, []string{"+", "-", "/", "*", "%"})
-parser.DefineStringToken(`"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
+parser.DefineStringToken(TDoubleQuoted, `"`, `"`).SetEscapeSymbol(tokenizer.BackSlash)
+parser.AllowKeywordSymbols(tokenizer.Underscore, tokenizer.Numbers)

// create tokens' stream
stream := parser.ParseString(`user_id = 119 and modified > "2020-01-01 00:00:00" or amount >= 122.34`)
defer stream.Close()

// iterate over each token
-for stream.Valid() {
+for stream.IsValid() {
    if stream.CurrentToken().Is(tokenizer.TokenKeyword) {
-        field := stream.NextToken().ValueString()
-        // ...
+        field := stream.CurrentToken().ValueString()
+        // ...
    }
    stream.GoNext()
}
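
An aside on the constant block above (a sketch, not part of the diff): `iota + 1` assigns the keys the same values the old explicit constants had, so existing keys keep their numbering while the new `TDoubleQuoted` key is added:

```go
const (
    TEquality     = iota + 1 // 1, same as the old explicit value
    TDot                     // 2
    TMath                    // 3
    TDoubleQuoted            // 4, the new key passed to DefineStringToken
)
```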
@@ -68,15 +72,15 @@ tokens: |user_id| =| 119| and| modified| >| "2020-01-01 00:00:00"| or| amount| >
0: {key: TokenKeyword, value: "user_id"} token.Value() == "user_id"
1: {key: TEquality, value: "="} token.Value() == "="
-2: {key: TokenInteger, value: "119"} token.ValueInt() == 119
+2: {key: TokenInteger, value: "119"} token.ValueInt64() == 119
3: {key: TokenKeyword, value: "and"} token.Value() == "and"
4: {key: TokenKeyword, value: "modified"} token.Value() == "modified"
5: {key: TEquality, value: ">"} token.Value() == ">"
6: {key: TokenString, value: "\"2020-01-01 00:00:00\""} token.ValueUnescaped() == "2020-01-01 00:00:00"
7: {key: TokenKeyword, value: "or"} token.Value() == "or"
8: {key: TokenKeyword, value: "amount"} token.Value() == "amount"
9: {key: TEquality, value: ">="} token.Value() == ">="
-10: {key: TokenFloat, value: "122.34"} token.ValueFloat() == 122.34
+10: {key: TokenFloat, value: "122.34"} token.ValueFloat64() == 122.34
```

More examples:
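
As an editorial sketch of the accessors named above (`ValueInt64`, `ValueFloat64`, `ValueUnescaped`, and `ValueString` all appear in this commit; this block is illustration, not readme content), reading typed values might look like:

```go
// Dispatch on the token kind and read a typed value.
for stream.IsValid() {
    tok := stream.CurrentToken()
    switch {
    case tok.Is(tokenizer.TokenInteger):
        fmt.Println(tok.ValueInt64()) // e.g. 119
    case tok.Is(tokenizer.TokenFloat):
        fmt.Println(tok.ValueFloat64()) // e.g. 122.34
    case tok.Is(tokenizer.TokenString):
        fmt.Println(tok.ValueUnescaped()) // e.g. 2020-01-01 00:00:00
    default:
        fmt.Println(tok.ValueString())
    }
    stream.GoNext()
}
```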
@@ -87,12 +91,11 @@ More examples:
### Create and parse

```go
-import (
-    "github.com/bzick/tokenizer"
-)
+import "github.com/bzick/tokenizer"

parser := tokenizer.New()
-parser.AllowKeywordUnderscore() // ... and other configuration code
+parser.AllowKeywordSymbols(tokenizer.Underscore, []rune{})
+// ... and other configuration code

```

@@ -110,20 +113,20 @@ fp, err := os.Open("data.json") // huge JSON file

stream := parser.ParseStream(fp, 4096).SetHistorySize(10)
defer stream.Close()
-for stream.IsValid() {
+for stream.IsValid() {
    // ...
    stream.GoNext()
}
```
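
Stepping outside the diff for a moment: the snippet above elides the `err` check on `os.Open`. A self-contained sketch of the same stream-parsing flow, assuming the API exactly as displayed in this hunk, might read:

```go
package main

import (
	"log"
	"os"

	"github.com/bzick/tokenizer"
)

func main() {
	parser := tokenizer.New()
	// ... define tokens as in the earlier example ...

	fp, err := os.Open("data.json") // huge JSON file
	if err != nil {
		log.Fatal(err)
	}
	defer fp.Close()

	// Parse in 4096-byte chunks; SetHistorySize(10) presumably bounds
	// how many already-seen tokens stay reachable in the stream.
	stream := parser.ParseStream(fp, 4096).SetHistorySize(10)
	defer stream.Close()
	for stream.IsValid() {
		// ... inspect stream.CurrentToken() ...
		stream.GoNext()
	}
}
```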

## Embedded tokens

-- `tokenizer.TokenUnknown` — unspecified token key.
+- `tokenizer.TokenUnknown` — unspecified token key.
- `tokenizer.TokenKeyword` — keyword, any combination of letters, including unicode letters.
- `tokenizer.TokenInteger` — integer value
- `tokenizer.TokenFloat` — float/double value
- `tokenizer.TokenString` — quoted string
-- `tokenizer.TokenStringFragment` — fragment framed (quoted) string
+- `tokenizer.TokenStringFragment` — fragment framed (quoted) string

### Unknown token

@@ -132,6 +135,7 @@ A token is marked as `tokenizer.TokenUnknown` if the parser detects an unknown token
```go
stream := parser.ParseString(`one!`)
```
+
```
stream: [
    {
@@ -151,6 +155,7 @@ Setting `tokenizer.StopOnUndefinedToken()` stops the parser when `tokenizer.TokenUn
```go
stream := parser.ParseString(`one!`)
```
+
```
stream: [
    {
@@ -168,11 +173,12 @@ and the length of the original string.

Any word that is not a custom token is stored in a single token as `tokenizer.TokenKeyword`.

-The word can contain unicode characters, numbers (see `tokenizer.AllowNumbersInKeyword()`) and underscore (see `tokenizer.AllowKeywordUnderscore ()`).
+The word can contain unicode characters, and it can be configured to contain other characters, like numbers and underscores (see `tokenizer.AllowKeywordSymbols()`).

```go
stream := parser.ParseString(`one 二 три`)
```
+
```
stream: [
    {
@@ -210,6 +216,7 @@ Any integer is stored as one token with key `tokenizer.TokenInteger`.
```go
stream := parser.ParseString(`223 999`)
```
+
```
stream: [
    {
@@ -223,11 +230,11 @@ stream: [
]
```

-To get int64 from the token value use `stream.GetInt()`:
+To get int64 from the token value use `stream.GetInt64()`:

```go
stream := parser.ParseString("123")
-fmt.Printf("Token is %d", stream.CurrentToken().GetInt()) // Token is 123
+fmt.Printf("Token is %d", stream.CurrentToken().GetInt64()) // Token is 123
```

### Float number
@@ -241,6 +248,7 @@ Any float number is stored as one token with key `tokenizer.TokenFloat`. Float n
```go
stream := parser.ParseString(`1.3e-8`)
```
+
```
stream: [
    {
@@ -250,11 +258,11 @@ stream: [
]
```

-To get float64 from the token value use `token.GetFloat()`:
+To get float64 from the token value use `token.GetFloat64()`:

```go
stream := parser.ParseString("1.3e2")
-fmt.Printf("Token is %v", stream.CurrentToken().GetFloat()) // Token is 130
+fmt.Printf("Token is %v", stream.CurrentToken().GetFloat64()) // Token is 130
```

### Framed string
@@ -271,6 +279,7 @@ parser.DefineStringToken(TokenDoubleQuotedString, `"`, `"`).SetEscapeSymbol('\\'
// ...
stream := parser.ParseString(`"two \"three"`)
```
+
```
stream: [
    {
@@ -280,10 +289,10 @@ stream: [
]
```

-To get a framed string without edge tokens and special characters, use the `stream.ValueUnescape()` method:
+To get a framed string without edge tokens and special characters, use the `stream.ValueUnescaped()` method:

```go
-value := stream.CurrentToken().ValueUnescape() // result: two "three
+value := stream.CurrentToken().ValueUnescaped() // result: two "three
```

The method `token.StringKey()` returns the token's string key as defined in `DefineStringToken`:
@@ -313,7 +322,9 @@ parser.DefineStringToken(TokenQuotedString, `"`, `"`).AddInjection(TokenOpenInje

parser.ParseString(`"one {{ two }} three"`)
```
+
Tokens:
+
```
{
    {
8 changes: 6 additions & 2 deletions tokenizer.go
@@ -40,6 +40,7 @@ const BackSlash = '\\'
var DefaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}

// DefaultStringEscapes is default escaped symbols. Those symbols are often used everywhere.
+//
// Deprecated: use DefaultSpecialString and AddSpecialStrings
var DefaultStringEscapes = map[byte]byte{
    'n': '\n',
@@ -102,6 +103,7 @@ func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
}

// SetSpecialSymbols set mapping of all escapable symbols for escape symbol, like \n, \t, \r.
+//
// Deprecated: use AddSpecialStrings
func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
    for _, v := range special {
@@ -171,6 +173,7 @@ func (t *Tokenizer) AllowKeywordSymbols(majorSymbols []rune, minorSymbols []rune
}

// AllowKeywordUnderscore allows underscore symbol in keywords, like `one_two` or `_three`
+//
// Deprecated: use AllowKeywordSymbols
func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
    t.kwMajorSymbols = append(t.kwMajorSymbols, '_')
@@ -180,6 +183,7 @@ func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
// AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`
// The method allows numbers in keywords, but the keyword itself must not start with a number.
// There should be no spaces between letters and numbers.
+//
// Deprecated: use AllowKeywordSymbols
func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
    t.kwMinorSymbols = append(t.kwMinorSymbols, Numbers...)
@@ -230,10 +234,10 @@ func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
// For example, a piece of data surrounded by quotes: "string in quotes" or 'string on single quotes'.
// Arguments startToken and endToken defines open and close "quotes".
//
-// - `t.DefineStringToken("`", "`")` - parse string "one `two three`" will be parsed as
+// - `t.DefineStringToken(10, "`", "`")` - parse string "one `two three`" will be parsed as
// [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
//
-// - `t.DefineStringToken("//", "\n")` - parse string "parse // like comment\n" will be parsed as
+// - `t.DefineStringToken(11, "//", "\n")` - parse string "parse // like comment\n" will be parsed as
// [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
    q := &StringSettings{
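
To make the corrected doc comment concrete, a usage sketch (the key values 10 and 11 are the arbitrary examples from the comment; the constant names are hypothetical, and this is editorial illustration, not repository code):

```go
const (
    TBacktickString tokenizer.TokenKey = 10 // hypothetical key names
    TLineComment    tokenizer.TokenKey = 11
)

parser := tokenizer.New()
parser.DefineStringToken(TBacktickString, "`", "`")
parser.DefineStringToken(TLineComment, "//", "\n")

stream := parser.ParseString("one `two three`\nparse // like comment\n")
defer stream.Close()
```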