Created
January 12, 2026 12:36
-
-
Save shar0/d7b9b1ed7e768e5f3c479bd7ef3def7b to your computer and use it in GitHub Desktop.
Basic Tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System; | |
| using System.Collections.Generic; | |
| using System.Text; | |
| namespace BasicTokenizer | |
| { | |
// 1. Token categories produced by the tokenizer.
//    The numeric values are spelled out explicitly; they are the same values
//    the compiler would assign implicitly from declaration order.
public enum TokenType
{
    EOF = 0,         // end of input
    Integer = 1,     // integer literal (e.g. 10, 20)
    String = 2,      // string literal (e.g. "Hello")
    Identifier = 3,  // identifier / variable name (e.g. A, count)
    Keyword = 4,     // keyword (e.g. PRINT, IF, GOTO)
    Plus = 5,        // +
    Minus = 6,       // -
    Multiply = 7,    // *
    Divide = 8,      // /
    Equals = 9,      // =
    LParen = 10,     // (
    RParen = 11,     // )
    Unknown = 12     // any character not recognized above
}
// 2. Token: a read-only pair of a token category and the text that goes with it.
public class Token
{
    // Stores both arguments as given; no validation or copying is performed.
    public Token(TokenType type, string value)
    {
        this.Type = type;
        this.Value = value;
    }

    // Category of this token.
    public TokenType Type { get; }

    // Text associated with this token, exactly as passed to the constructor.
    public string Value { get; }

    // Renders the token as: Token(<Type>, "<Value>")
    public override string ToString() => $"Token({Type}, \"{Value}\")";
}
// 3. Core tokenizer.
//    Wraps a source string and hands out one Token per GetNextToken() call,
//    returning a TokenType.EOF token once the input is exhausted.
public class Lexer
{
    // Common BASIC keywords. Lookup is case-insensitive; keyword tokens are
    // always emitted in upper case.
    private static readonly HashSet<string> Keywords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        "PRINT", "IF", "THEN", "ELSE", "FOR", "TO", "NEXT", "GOTO", "END", "LET", "INPUT"
    };

    private readonly string _text;
    private int _pos;

    // Creates a lexer positioned at the start of `text`.
    // Throws ArgumentNullException when `text` is null, instead of failing
    // later with a NullReferenceException on the first character access.
    public Lexer(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        _text = text;
        _pos = 0;
    }

    // True once every character has been consumed. End of input is decided by
    // position, not by a sentinel character, so a literal '\0' inside the
    // source does not terminate lexing.
    private bool IsAtEnd => _pos >= _text.Length;

    // Character at the current position, or '\0' when past the end.
    private char CurrentChar => IsAtEnd ? '\0' : _text[_pos];

    private void Advance()
    {
        _pos++;
    }

    // Returns the next token. Whitespace is skipped. At end of input a token
    // of type EOF with a null Value is returned (on every subsequent call too).
    public Token GetNextToken()
    {
        while (!IsAtEnd && char.IsWhiteSpace(CurrentChar))
        {
            Advance();
        }

        if (IsAtEnd)
        {
            return new Token(TokenType.EOF, null);
        }

        char c = CurrentChar;

        // Integer literal (digits only).
        if (char.IsDigit(c))
        {
            return new Token(TokenType.Integer, ParseNumber());
        }

        // Identifier or keyword.
        if (char.IsLetter(c))
        {
            return ParseIdentifierOrKeyword();
        }

        // String literal.
        if (c == '"')
        {
            return ParseString();
        }

        // Single-character symbols; every branch consumes exactly one character.
        Advance();
        switch (c)
        {
            case '+': return new Token(TokenType.Plus, "+");
            case '-': return new Token(TokenType.Minus, "-");
            case '*': return new Token(TokenType.Multiply, "*");
            case '/': return new Token(TokenType.Divide, "/");
            case '=': return new Token(TokenType.Equals, "=");
            case '(': return new Token(TokenType.LParen, "(");
            case ')': return new Token(TokenType.RParen, ")");
            default: return new Token(TokenType.Unknown, c.ToString());
        }
    }

    // Consumes a run of consecutive digits and returns it as text.
    private string ParseNumber()
    {
        int start = _pos;
        while (!IsAtEnd && char.IsDigit(CurrentChar))
        {
            Advance();
        }

        return _text.Substring(start, _pos - start);
    }

    // Consumes a run of letters, digits and underscores, then classifies it
    // as a keyword or an identifier.
    private Token ParseIdentifierOrKeyword()
    {
        int start = _pos;
        while (!IsAtEnd && (char.IsLetterOrDigit(CurrentChar) || CurrentChar == '_'))
        {
            Advance();
        }

        string text = _text.Substring(start, _pos - start);

        if (Keywords.Contains(text))
        {
            // Invariant casing: a culture-sensitive ToUpper() would turn "if"
            // into "İF" under Turkish/Azeri cultures.
            return new Token(TokenType.Keyword, text.ToUpperInvariant());
        }

        return new Token(TokenType.Identifier, text);
    }

    // Consumes a double-quoted string and returns its contents without the
    // quotes. An unterminated string yields everything up to end of input.
    private Token ParseString()
    {
        Advance(); // skip the opening quote

        int start = _pos;
        while (!IsAtEnd && CurrentChar != '"')
        {
            Advance();
        }

        string value = _text.Substring(start, _pos - start);

        // Skip the closing quote only if there is one.
        if (!IsAtEnd)
        {
            Advance();
        }

        return new Token(TokenType.String, value);
    }
}
// 4. Demo program: tokenizes a small BASIC listing and prints each token.
class Program
{
    static void Main(string[] args)
    {
        // A short piece of BASIC code used as sample input. The lines of the
        // verbatim string are kept flush-left because their leading whitespace
        // is part of the string.
        string code = @"
10 LET A = 5
20 PRINT ""Result is""
30 IF A = 5 THEN GOTO 50
40 END
50 PRINT A + 10
";
        Console.WriteLine($"Source Code:\n{code}\n");
        Console.WriteLine("Tokens:");
        Console.WriteLine("--------------------------------");

        // Pull tokens one at a time until the EOF token shows up; the EOF
        // token itself is not printed.
        Lexer lexer = new Lexer(code);
        for (Token current = lexer.GetNextToken();
             current.Type != TokenType.EOF;
             current = lexer.GetNextToken())
        {
            Console.WriteLine(current);
        }
    }
}
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment