Skip to content

Instantly share code, notes, and snippets.

@shar0
Created January 12, 2026 12:36
Show Gist options
  • Select an option

  • Save shar0/d7b9b1ed7e768e5f3c479bd7ef3def7b to your computer and use it in GitHub Desktop.

Select an option

Save shar0/d7b9b1ed7e768e5f3c479bd7ef3def7b to your computer and use it in GitHub Desktop.
Basic Tokenizer
using System;
using System.Collections.Generic;
using System.Text;
namespace BasicTokenizer
{
// 1. Token categories.
/// <summary>
/// The kinds of token the lexer can produce. The numeric values are spelled out
/// explicitly and match the declaration order.
/// </summary>
public enum TokenType
{
    /// <summary>End of input.</summary>
    EOF = 0,

    /// <summary>Integer literal (for example: 10, 20).</summary>
    Integer = 1,

    /// <summary>String literal (for example: "Hello").</summary>
    String = 2,

    /// <summary>Identifier / variable name (for example: A, count).</summary>
    Identifier = 3,

    /// <summary>Reserved keyword (for example: PRINT, IF, GOTO).</summary>
    Keyword = 4,

    /// <summary>The '+' symbol.</summary>
    Plus = 5,

    /// <summary>The '-' symbol.</summary>
    Minus = 6,

    /// <summary>The '*' symbol.</summary>
    Multiply = 7,

    /// <summary>The '/' symbol.</summary>
    Divide = 8,

    /// <summary>The '=' symbol.</summary>
    Equals = 9,

    /// <summary>The '(' symbol.</summary>
    LParen = 10,

    /// <summary>The ')' symbol.</summary>
    RParen = 11,

    /// <summary>Any character that matches no other category.</summary>
    Unknown = 12
}
// 2. The Token class.
/// <summary>
/// An immutable pair of a token category and the text associated with it.
/// </summary>
public class Token
{
    /// <summary>The category assigned to this token.</summary>
    public TokenType Type { get; }

    /// <summary>The text passed to the constructor, stored unchanged (no validation is applied).</summary>
    public string Value { get; }

    /// <summary>Creates a token from a category and its text.</summary>
    /// <param name="type">The token category.</param>
    /// <param name="value">The token text; stored as given.</param>
    public Token(TokenType type, string value)
    {
        this.Type = type;
        this.Value = value;
    }

    /// <summary>Renders the token as <c>Token(Type, "Value")</c>.</summary>
    /// <returns>A display string containing the category and the quoted text.</returns>
    public override string ToString() => $"Token({Type}, \"{Value}\")";
}
// 3. The core tokenizer.
/// <summary>
/// Scans a piece of BASIC source text from left to right and returns one token per
/// call to <see cref="GetNextToken"/>.
/// </summary>
public class Lexer
{
    private readonly string _text;
    private int _pos;

    // Common BASIC keywords. BASIC is usually case-insensitive, so the set compares
    // with OrdinalIgnoreCase and matched keywords are normalized to upper case.
    private static readonly HashSet<string> Keywords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "PRINT", "IF", "THEN", "ELSE", "FOR", "TO", "NEXT", "GOTO", "END", "LET", "INPUT"
    };

    /// <summary>Creates a lexer positioned at the first character of <paramref name="text"/>.</summary>
    /// <param name="text">The source text to tokenize.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public Lexer(string text)
    {
        // Fail at construction rather than with a NullReferenceException on the first read.
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        _text = text;
        _pos = 0;
    }

    // True once every character has been consumed. End of input is decided by position,
    // not by a sentinel character, so a literal '\0' inside the text is ordinary input.
    private bool AtEnd => _pos >= _text.Length;

    // The character at the cursor, or '\0' when the cursor is past the end.
    private char CurrentChar => _pos < _text.Length ? _text[_pos] : '\0';

    private void Advance()
    {
        _pos++;
    }

    // Only ASCII 0-9 start or continue an Integer token. char.IsDigit would also accept
    // other Unicode decimal digits, which int.Parse cannot convert.
    private static bool IsAsciiDigit(char c) => c >= '0' && c <= '9';

    /// <summary>Returns the next token, skipping any leading whitespace.</summary>
    /// <returns>
    /// The next token. When the input is exhausted, a token of type
    /// <see cref="TokenType.EOF"/> with a null value is returned (on every further call too).
    /// A character that starts no known token is returned as <see cref="TokenType.Unknown"/>.
    /// </returns>
    public Token GetNextToken()
    {
        // Skip whitespace.
        while (!AtEnd && char.IsWhiteSpace(CurrentChar))
        {
            Advance();
        }

        if (AtEnd)
        {
            return new Token(TokenType.EOF, null);
        }

        char c = CurrentChar;

        // Numbers (plain integers only).
        if (IsAsciiDigit(c))
        {
            return new Token(TokenType.Integer, ParseNumber());
        }

        // Identifiers or keywords.
        if (char.IsLetter(c))
        {
            return ParseIdentifierOrKeyword();
        }

        // String literals.
        if (c == '"')
        {
            return ParseString();
        }

        // Single-character symbols: every remaining case consumes exactly one character.
        Advance();
        switch (c)
        {
            case '+': return new Token(TokenType.Plus, "+");
            case '-': return new Token(TokenType.Minus, "-");
            case '*': return new Token(TokenType.Multiply, "*");
            case '/': return new Token(TokenType.Divide, "/");
            case '=': return new Token(TokenType.Equals, "=");
            case '(': return new Token(TokenType.LParen, "(");
            case ')': return new Token(TokenType.RParen, ")");
            default: return new Token(TokenType.Unknown, c.ToString());
        }
    }

    // Consumes a run of consecutive ASCII digits and returns it as text.
    private string ParseNumber()
    {
        int start = _pos;
        while (!AtEnd && IsAsciiDigit(CurrentChar))
        {
            Advance();
        }

        return _text.Substring(start, _pos - start);
    }

    // Consumes a run of letters, digits and underscores, then classifies it as a
    // keyword (value normalized to upper case) or an identifier (value kept as written).
    private Token ParseIdentifierOrKeyword()
    {
        int start = _pos;
        while (!AtEnd && (char.IsLetterOrDigit(CurrentChar) || CurrentChar == '_'))
        {
            Advance();
        }

        string text = _text.Substring(start, _pos - start);

        if (Keywords.Contains(text))
        {
            // Invariant casing: a culture-sensitive ToUpper() would turn "if" into "İF"
            // under a Turkish current culture.
            return new Token(TokenType.Keyword, text.ToUpperInvariant());
        }

        return new Token(TokenType.Identifier, text);
    }

    // Consumes a double-quoted string and returns its contents without the quotes.
    // There is no escape handling. An unterminated string yields everything up to the
    // end of the input.
    private Token ParseString()
    {
        Advance(); // Skip the opening quote.

        int start = _pos;
        while (!AtEnd && CurrentChar != '"')
        {
            Advance();
        }

        string value = _text.Substring(start, _pos - start);

        // Skip the closing quote only if there is one; the cursor never moves past the end.
        if (!AtEnd)
        {
            Advance();
        }

        return new Token(TokenType.String, value);
    }
}
// 4. Demo program.
/// <summary>
/// Console entry point that tokenizes a small hard-coded BASIC program and prints
/// each token on its own line.
/// </summary>
class Program
{
    /// <summary>Prints the sample source, then every token up to (not including) EOF.</summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // A short piece of BASIC code.
        var code = @"
10 LET A = 5
20 PRINT ""Result is""
30 IF A = 5 THEN GOTO 50
40 END
50 PRINT A + 10
";

        Console.WriteLine($"Source Code:\n{code}\n");
        Console.WriteLine("Tokens:");
        Console.WriteLine("--------------------------------");

        var lexer = new Lexer(code);

        // Pull tokens one at a time until the lexer reports end of input.
        for (Token current = lexer.GetNextToken();
             current.Type != TokenType.EOF;
             current = lexer.GetNextToken())
        {
            Console.WriteLine(current);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment