one/lexer.v

module main
import term
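
// Every kind of token the lexer can produce: keywords, literals, operators,
// delimiters, plus the sentinel `eof` and the fallback `unknown`.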
enum TokenType as u8 {
	kw_let
	kw_const
	kw_immutable
	type_name
	kw_if
	kw_else
	kw_elif
	kw_while
	kw_break
	kw_fn
	kw_return
	kw_print
	kw_class
	kw_ref
	kw_deref
	integer
	real
	boolean
	string
	identifier
	plus
	minus
	star
	slash
	percent
	bit_and
	bit_or
	and_and
	or_or
	equals
	less
	greater
	eq_eq
	greater_eq
	less_eq
	not_eq
	plus_eq
	minus_eq
	star_eq
	slash_eq
	percent_eq
	and_eq
	or_eq
	increment
	decrement
	lparen
	rparen
	lsqparen
	rsqparen
	lbracket
	rbracket
	dot
	comma
	semicolon
	colon
	eof
	unknown
}
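
// A single token: its kind and the text it was read from.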
struct Token {
	kind TokenType
	text string
}

fn str_from_toktype(kind TokenType) string {
	return kind.str()
}
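
// Maps a one- or two-character operator/delimiter string to its token kind.
// Anything that is not recognised falls through to .unknown.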
fn toktype_from_delimiter(delimiter string) TokenType {
	return match delimiter {
		'(' { .lparen }
		')' { .rparen }
		'[' { .lsqparen }
		']' { .rsqparen }
		'{' { .lbracket }
		'}' { .rbracket }
		'+' { .plus }
		'-' { .minus }
		'*' { .star }
		'/' { .slash }
		'%' { .percent }
		'=' { .equals }
		'<' { .less }
		'>' { .greater }
		'.' { .dot }
		',' { .comma }
		';' { .semicolon }
		':' { .colon }
		'==' { .eq_eq }
		'>=' { .greater_eq }
		'<=' { .less_eq }
		'!=' { .not_eq }
		'+=' { .plus_eq }
		'-=' { .minus_eq }
		'*=' { .star_eq }
		'/=' { .slash_eq }
		'%=' { .percent_eq }
		'++' { .increment }
		'--' { .decrement }
		'&' { .bit_and }
		'&&' { .and_and }
		'|' { .bit_or }
		'||' { .or_or }
		'&=' { .and_eq }
		'|=' { .or_eq }
		else { .unknown }
	}
}
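
// Maps a reserved word of the language being lexed to its token kind.
// Built-in type names collapse to .type_name and true/false to .boolean.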
fn toktype_from_kw(kw string) TokenType {
	return match kw {
		'let' { .kw_let }
		'const' { .kw_const }
		'void', 'real', 'bool', 'int', 'string' { .type_name }
		'if' { .kw_if }
		'else' { .kw_else }
		'elif' { .kw_elif }
		'while' { .kw_while }
		'break' { .kw_break }
		'fn' { .kw_fn }
		'return' { .kw_return }
		'true', 'false' { .boolean }
		'print' { .kw_print }
		'class' { .kw_class }
		'immutable' { .kw_immutable }
		'ref' { .kw_ref }
		'deref' { .kw_deref }
		else { .unknown }
	}
}
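
// Reports whether `c` ends the token currently being scanned. While inside a
// number literal a '.' is part of the literal (a real number) rather than the
// member-access delimiter, so it is excluded from the delimiter set.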
fn is_delimiter(c u8, is_inside_number bool) bool {
	valid_chars := if is_inside_number {
		' #+-*/,;:%<>()[]{}=|&!\n"'
	} else {
		'. #+-*/,;:%<>()[]{}=|&!\n"'
	}
	return valid_chars.contains(c.ascii_str())
}
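
// A real literal is exactly one '.' with an integer on each side, e.g. 3.14.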
fn is_real(str string) bool {
	left, right := str.split_once('.') or { return false }
	return !right.contains('.') && left.is_int() && right.is_int()
}
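
// Reserved words of the language, including type names and boolean literals.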
fn is_keyword(str string) bool {
	return [
		'void', 'int', 'real', 'bool', 'string', 'if', 'else', 'elif', 'while', 'break',
		'fn', 'return', 'let', 'const', 'true', 'false', 'print', 'class', 'immutable', 'ref', 'deref',
	].contains(str)
}
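
// Debug helpers: print each token's text (with newlines escaped) and its kind.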
fn print_tok(tok Token) {
	println('${tok.text.replace("\n", "\\n"):8} (${str_from_toktype(tok.kind)})')
}

fn print_toks(toks []Token) {
	for tok in toks {
		print_tok(tok)
	}
}
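
// Splits `input` into a token stream using two cursors: `left` marks the start
// of the token being scanned and `right` advances until it reaches a delimiter.
// String literals ("...") and comments (# ... # or # ... \n) are consumed by the
// two inner loops before normal scanning resumes. For example, `let x = 5`
// followed by a newline lexes to kw_let, identifier, equals, integer, eof.
// Returns none when an invalid token is found. The scan expects strings and
// comments to be terminated and the input to end with a delimiter such as a
// trailing newline; otherwise the final token may be dropped or indexing may
// run past the end of `input`.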
fn lex(input string) ?[]Token {
	mut left := 0
	mut right := 0
	mut line := 1
	mut tokens := []Token{}
	mut is_inside_number := false
	mut is_inside_string := false
	mut is_inside_comment := false
	for right < input.len && left <= right {
		// Skip over a comment; it ends at the next '#' or at a newline.
		for is_inside_comment {
			right++
			if ['#', '\n'].contains(input[right].ascii_str()) {
				is_inside_comment = false
				right++
			}
			left = right
		}
		// Consume a string literal and emit it without the surrounding quotes.
		for is_inside_string {
			right++
			if input[right].ascii_str() == '"' {
				is_inside_string = false
				right++
				tokens << Token{.string, input.substr(left + 1, right - 1)}
				left = right
			}
		}
		is_inside_number = input[left].ascii_str().is_int()
		if input[right] == `\n` {
			line++
		}
		if !is_delimiter(input[right], is_inside_number) {
			right++
		}
		if right >= input.len {
			break
		}
		if is_delimiter(input[right], is_inside_number) && left == right {
			// The cursor sits directly on a delimiter: emit it as its own token,
			// merging two-character operators such as == or +=.
			if !input[right].is_space() {
				if input[right].ascii_str() == '"' {
					is_inside_string = true
					continue
				}
				if input[right].ascii_str() == '#' {
					is_inside_comment = true
					continue
				}
				mut tok_str := input[right].ascii_str()
				if right + 1 < input.len {
					combined := input.substr(right, right + 2)
					if combined in ['==', '>=', '<=', '!=', '+=', '-=', '*=', '/=', '%=', '++', '--', '||', '&&', '&=', '|='] {
						tok_str = combined
						right++
					}
				}
				tokens << Token{toktype_from_delimiter(tok_str), tok_str}
			}
			right++
			left = right
		} else if (is_delimiter(input[right], is_inside_number) && left != right) || (right == input.len && left != right) {
			// A delimiter ends the span [left, right): classify it as a keyword,
			// a literal or an identifier, or reject it as invalid.
			subs := input.substr(left, right)
			if is_keyword(subs) {
				tokens << Token{toktype_from_kw(subs), subs}
			} else if subs.is_int() {
				tokens << Token{TokenType.integer, subs}
			} else if is_real(subs) {
				tokens << Token{TokenType.real, subs}
			} else if subs.is_identifier() {
				tokens << Token{TokenType.identifier, subs}
			} else if !subs.is_identifier() && !is_delimiter(input[right - 1], is_inside_number) {
				eprintln(term.red('ERROR: found invalid token ' + subs + ' at line ' + line.str()))
				return none
			}
			left = right
		}
	}
	tokens << Token{TokenType.eof, 'EOF'}
	return tokens
}