more token types

This commit is contained in:
uan
2026-02-02 19:34:10 +01:00
parent 38504582ae
commit fbae7bc5d5
4 changed files with 99 additions and 24 deletions

107
lexer.v
View File

@@ -1,11 +1,30 @@
module main
import term
// Kinds of tokens the lexer can emit.
// Backed by u8 to keep Token small; member order fixes the ordinal
// values, so do not reorder without checking serialized uses.
enum TokenType as u8 {
	// multi-character lexemes, classified after the lexeme is complete
	integer
	real
	operator // NOTE(review): looks superseded by the per-operator kinds below — confirm no remaining producers
	keyword
	identifier
	// single-character operator tokens (added in this commit)
	plus
	minus
	star
	slash
	equals
	less
	greater
	// grouping delimiters
	lparen
	rparen
	lsqparen // '['
	rsqparen // ']'
	lbracket // '{'
	rbracket // '}'
	// punctuation
	dot
	comma
	semicolon
	colon
	newline
	// stream sentinels
	eof
	unknown
}
@@ -15,47 +34,95 @@ struct Token {
text string
}
// Map a token kind to the display name used when dumping tokens.
// V requires `match` over an enum to be exhaustive, so every TokenType
// member appears exactly once; arms follow the enum declaration order.
fn str_from_toktype(t TokenType) string {
	return match t {
		.integer { 'integer' }
		.real { 'real' }
		.operator { 'operator' }
		.keyword { 'keyword' }
		.identifier { 'identifier' }
		.plus { 'plus' }
		.minus { 'minus' }
		.star { 'star' }
		.slash { 'slash' }
		.equals { 'equals' }
		.less { 'less' }
		.greater { 'greater' }
		.lparen { 'lparen' }
		.rparen { 'rparen' }
		.lsqparen { 'lsqparen' }
		.rsqparen { 'rsqparen' }
		.lbracket { 'lbracket' }
		.rbracket { 'rbracket' }
		.dot { 'dot' }
		.comma { 'comma' }
		.semicolon { 'semicolon' }
		.colon { 'colon' }
		.newline { 'newline' }
		.eof { 'EOF' }
		.unknown { 'unknown' }
	}
}
// Classify a one-character delimiter string as its TokenType.
// Anything not listed below — including the space character — maps
// to .unknown, which callers are expected to filter out.
fn toktype_from_delimiter(delimiter string) TokenType {
	return match delimiter {
		// arithmetic and comparison operators
		'+' { .plus }
		'-' { .minus }
		'*' { .star }
		'/' { .slash }
		'=' { .equals }
		'<' { .less }
		'>' { .greater }
		// grouping delimiters
		'(' { .lparen }
		')' { .rparen }
		'[' { .lsqparen }
		']' { .rsqparen }
		'{' { .lbracket }
		'}' { .rbracket }
		// punctuation
		'.' { .dot }
		',' { .comma }
		';' { .semicolon }
		':' { .colon }
		'\n' { .newline }
		else { .unknown }
	}
}
// Reports whether `c` terminates the current lexeme: whitespace,
// operators, punctuation, grouping characters, or a newline.
// Fix: the set now includes '.' and ':' — the enum defines .dot/.colon
// and toktype_from_delimiter maps them, but the old set never let the
// lexer reach them. NOTE(review): with '.' as a delimiter, real literals
// like "1.5" are split before is_real ever sees them — confirm the
// parser reassembles them.
fn is_delimiter(c u8) bool {
	return " +-*/.,;:%<>()[]{}=\n".contains(c.ascii_str())
}
// Reports whether `c` is a single-character arithmetic/assignment
// operator.
// Fix: the body contained a second `return` statement (the new
// is_delimiter charset, merged in by the diff rendering) that was
// unreachable dead code; it has been removed.
fn is_operator(c u8) bool {
	return "+-*/=".contains(c.ascii_str())
}
// Reports whether `str` looks like a real-number literal: exactly one
// '.' with an integer-parseable part on each side (so "1.5" passes,
// while "", "1.", ".5" and "1.2.3" all fail).
fn is_real(str string) bool {
	parts := str.split('.')
	if parts.len != 2 {
		return false
	}
	return parts[0].is_int() && parts[1].is_int()
}
// Reports whether `str` is one of the language's reserved words.
// Fix: the diff rendering fused the old and new keyword rows into one
// list with no separator (a syntax error); this keeps the row added by
// this commit. NOTE(review): that row adds "bool"/"for" and drops
// "while" — confirm dropping "while" is intentional.
fn is_keyword(str string) bool {
	return [
		"void", "int", "real", "bool", "if", "else", "for", "break", "fn", "return"
	].contains(str)
}
// Pretty-print one token as "text (kind)", padding the text to 8 columns.
// Newlines in the token text are escaped to "\n" so every token stays on
// a single output line.
// Fix: the block contained both the old and the new println (diff lines
// merged), which would print each token twice; only the escaped form is
// kept.
fn print_tok(tok Token) {
	println("${tok.text.replace("\n", "\\n"):8} (${str_from_toktype(tok.type)})")
}
// Print every token in the stream, one per line, via print_tok.
// Fix: a leftover old `fn lex(input string) {` signature (the deleted
// side of the diff) preceded this function, nesting it illegally; the
// stray header has been removed.
fn print_toks(toks []Token) {
	for tok in toks {
		print_tok(tok)
	}
}
// Tokenize `input` into a []Token, terminated by an explicit EOF token,
// or return `none` after printing a red error for the first invalid
// token encountered.
//
// NOTE(review): this span is a rendered diff, not compilable source —
// the `@@` hunk headers are kept verbatim, the lines between hunks are
// missing from this view, and several deleted/added line pairs are
// interleaved (flagged inline below).
fn lex(input string) ?[]Token {
	mut left := 0 // start index of the lexeme currently being scanned
	mut right := 0 // index of the character under examination
	mut line := 1 // 1-based line counter, used only for error messages
	mut tokens := []Token{}
	for (right < input.len && left <= right) {
		// count source lines as newline characters pass by
		if input[right] == `\n` {
			line++
		}
		// extend the current lexeme while no delimiter is reached
		if !is_delimiter(input[right]) {
			right++
		}
@@ -63,8 +130,8 @@ fn lex(input string) {
		break
	}
	// a delimiter with no pending lexeme: emit it as its own token
	if is_delimiter(input[right]) && left == right {
		// NOTE(review): next two lines are the OLD (deleted) behavior,
		// which printed only operator tokens...
		if is_operator(input[right]) {
			print_tok(Token{TokenType.operator, input[right].ascii_str()})
		// ...and these two are the NEW behavior: skip spaces, turn every
		// other delimiter into its specific token kind.
		if input[right] != ` ` {
			tokens << Token{toktype_from_delimiter(input[right].ascii_str()), input[right].ascii_str()}
		}
		right++
		left = right
@@ -72,18 +139,20 @@ fn lex(input string) {
	// a delimiter (or end of input) terminates a pending multi-char
	// lexeme; classify the substring [left, right)
	else if (is_delimiter(input[right]) && left != right) || (right == input.len && left != right) {
		subs := input.substr(left, right)
		// NOTE(review): each OLD print_tok line below is paired with its
		// NEW replacement, which appends to `tokens` instead of printing.
		if is_keyword(subs) {
			print_tok(Token{TokenType.keyword, subs})
			tokens << Token{TokenType.keyword, subs}
		} else if subs.is_int() {
			print_tok(Token{TokenType.integer, subs})
			tokens << Token{TokenType.integer, subs}
		} else if is_real(subs) {
			print_tok(Token{TokenType.real, subs})
			tokens << Token{TokenType.real, subs}
		} else if subs.is_identifier() {
			print_tok(Token{TokenType.identifier, subs})
			tokens << Token{TokenType.identifier, subs}
		} else if !subs.is_identifier() && !is_delimiter(input[right-1]) {
			print_tok(Token{TokenType.unknown, subs})
			// NEW: invalid tokens now abort lexing with an error message
			eprintln(term.red("ERROR: found invalid token " + subs + " at line " + line.str()))
			return none
		}
		left = right
	}
}
// NOTE(review): old printed EOF vs new appended EOF sentinel, interleaved:
print_tok(Token{TokenType.eof, "EOF"})
tokens << Token{TokenType.eof, "EOF"}
return tokens
}