Created
March 19, 2025 19:58
-
-
Save nihirash/df0c39c0cbe1e8b4791f49b60041398e to your computer and use it in GitHub Desktop.
Simplest tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
// Capacity of current_token in bytes, terminating NUL included.
#define MAX_TOKEN_SIZE 32

// ---- Tokenizer state (file-level, shared by every function below) ----

// Demo input that main() tokenizes.
char str_to_tokenize[] = " x=10 >= 5 <= 6 == 1+2+3>10<100*10/455+(a+b+c)";

// Index of the next unread character in str_to_tokenize.
int position = 0;

// One-character pushback slot; 0 means "nothing pushed back".
char poked = 0;

// Text of the most recent Id/Number token, NUL-terminated.
char current_token[MAX_TOKEN_SIZE];
// Token types returned by get_token(). Values are consecutive starting at 0,
// so Unknown == 0 and Assign == 24.
enum {
    Unknown = 0,    // unrecognized input; main() also stops on this value

    // Tokens that carry text in current_token
    Id,
    Number,

    // Punctuation
    EOS,            // ;
    Comma,          // ,
    LP,             // (
    RP,             // )
    Begin,          // {
    End,            // }
    LS,             // [
    RS,             // ]

    // Arithmetic operators
    Sum,            // +
    Dec,            // -
    Mul,            // *
    Div,            // /
    Mod,            // %

    // Printed by debug_token as "Logical And" / "Logical Or" / "Logical Xor"
    And,
    Or,
    Xor,

    // Comparison and assignment
    Greater,        // >
    Less,           // <
    Eq,             // ==
    GE,             // >=
    LE,             // <=
    Assign          // =
};
| char get_char() { | |
| char c; | |
| if (poked) { | |
| c = poked; | |
| poked = 0; | |
| return c; | |
| } | |
| if (str_to_tokenize[position] == 0) { | |
| return 0; | |
| } | |
| return str_to_tokenize[position++]; | |
| } | |
// Returns 1 when c is an ASCII letter (a-z or A-Z), 0 otherwise.
char is_alpha(char c) {
    int lower = ('a' <= c) && (c <= 'z');
    int upper = ('A' <= c) && (c <= 'Z');

    return (char)(lower || upper);
}
// Returns 1 when c is a decimal digit '0'..'9', 0 otherwise.
char is_digit(char c) {
    if (c < '0' || c > '9') {
        return 0;
    }
    return 1;
}
// Returns 1 when c is a space, a tab, or character code 13 or 10
// (carriage return / line feed in ASCII); 0 otherwise.
char is_space(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case 13:
    case 10:
        return 1;
    default:
        return 0;
    }
}
// Returns 1 when c is one of the punctuation/operator characters that
// get_token hands to process_symbols, 0 otherwise.
//
// Note: ':' is accepted here, although no token type in this file is
// produced for it.
char is_symbol(char c) {
    static const char symbols[] = "+-*&|><=%/(){}[],;:";
    const char *p;

    // Stop at the terminator so that c == '\0' is never reported as a symbol.
    for (p = symbols; *p != '\0'; ++p) {
        if (*p == c) {
            return 1;
        }
    }
    return 0;
}
| void skip_spaces() { | |
| char c; | |
| while(1) { | |
| c = get_char(); | |
| if (!is_space(c)) { | |
| poked = c; | |
| return; | |
| } | |
| } | |
| } | |
| void get_id() { | |
| int pos = 0; | |
| char c; | |
| while(1) { | |
| c = get_char(); | |
| if (is_alpha(c) || is_digit(c)) { | |
| current_token[pos++] = c; | |
| } else { | |
| current_token[pos] = 0; | |
| poked = c; | |
| return; | |
| } | |
| } | |
| } | |
| void get_num() { | |
| int pos = 0; | |
| char c; | |
| while(1) { | |
| c = get_char(); | |
| if (is_digit(c)) { | |
| current_token[pos++] = c; | |
| } else { | |
| current_token[pos] = 0; | |
| poked = c; | |
| return; | |
| } | |
| } | |
| } | |
| char process_symbols() { | |
| char c, tmp; | |
| c = get_char(); | |
| switch (c) | |
| { | |
| case '/': return Div; | |
| case '*': return Mul; | |
| case '+': return Sum; | |
| case '-': return Dec; | |
| case '%': return Mod; | |
| case '(': return LP; | |
| case ')': return RP; | |
| case ',': return Comma; | |
| case ';': return EOS; | |
| case '>': | |
| poked = get_char(); | |
| if (poked == '=') { | |
| poked = 0; | |
| return GE; | |
| } | |
| return Greater; | |
| case '<': | |
| poked = get_char(); | |
| if (poked == '=') { | |
| poked = 0; | |
| return LE; | |
| } | |
| return Less; | |
| case '=': | |
| poked = get_char(); | |
| if (poked == '=') { | |
| poked = 0; | |
| return Eq; | |
| } | |
| return Assign; | |
| default: | |
| return Unknown; | |
| } | |
| } | |
| char get_token() { | |
| char c; | |
| current_token[0] = 0; | |
| checks: | |
| skip_spaces(); | |
| c = get_char(); | |
| if (c == '#') { | |
| while (c != 13 && c != 10) { | |
| c = get_char(); | |
| } | |
| goto checks; | |
| } | |
| // Starting from symbols - special cases | |
| if (is_symbol(c)) { | |
| poked = c; | |
| return process_symbols(); | |
| } | |
| // Starting from Alpha - identity | |
| if (is_alpha(c)) { | |
| poked = c; | |
| get_id(); | |
| return Id; | |
| } | |
| // Starting from digit - numberic | |
| if (is_digit(c)) { | |
| poked = c; | |
| get_num(); | |
| return Number; | |
| } | |
| return Unknown; | |
| } | |
| void debug_token(char type) { | |
| switch (type) | |
| { | |
| case Id: | |
| printf("Id"); | |
| break; | |
| case Number: | |
| printf("Num"); | |
| break; | |
| case EOS: | |
| printf(";"); | |
| return; | |
| case Comma: printf("Comma"); return; | |
| case LP: printf("("); return; | |
| case RP: printf(")"); return; | |
| case Begin: printf(" Begin of block "); return; | |
| case End: printf(" End of block "); return; | |
| case LS: printf("[ "); return; | |
| case RS: printf("] "); return; | |
| case Sum: printf("add "); return; | |
| case Dec: printf("sub "); return; | |
| case Mul: printf("mul "); return; | |
| case Div: printf("div "); return; | |
| case Mod: printf(" mod "); return; | |
| case And: printf("Logical And "); return; | |
| case Or: printf("Logical Or "); return; | |
| case Xor: printf("Logical Xor "); return; | |
| case Greater: printf("Greater "); return; | |
| case Less: printf("Less "); return; | |
| case Eq: printf("Equals "); return; | |
| case GE: printf("Greater Or Equal "); return; | |
| case LE: printf("Less Or Equal "); return; | |
| case Assign: printf("Assign "); return; | |
| default: | |
| printf("Unknown type: "); | |
| break; | |
| } | |
| printf("(%s) ", current_token); | |
| } | |
| int main() { | |
| char t; | |
| t = get_token(); | |
| while (t != Unknown) { | |
| debug_token(t); | |
| t = get_token(); | |
| } | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment