Skip to content

Instantly share code, notes, and snippets.

@nihirash
Created March 19, 2025 19:58
Show Gist options
  • Select an option

  • Save nihirash/df0c39c0cbe1e8b4791f49b60041398e to your computer and use it in GitHub Desktop.

Select an option

Save nihirash/df0c39c0cbe1e8b4791f49b60041398e to your computer and use it in GitHub Desktop.
Simplest tokenizer
#include <stdio.h>
#include <stdlib.h>
// Size in bytes of the current_token buffer (including the terminating 0).
#define MAX_TOKEN_SIZE 32
// Hard-coded sample input; get_char() reads it one character at a time.
char str_to_tokenize[] = " x=10 >= 5 <= 6 == 1+2+3>10<100*10/455+(a+b+c)";
// Text of the most recent Id/Number token; written by get_id()/get_num()
// and reset to the empty string at the start of get_token().
char current_token[MAX_TOKEN_SIZE];
// Index of the next unread character in str_to_tokenize.
int position = 0;
// One-character pushback slot: when non-zero, get_char() returns this value
// (and clears it) before consuming anything more from str_to_tokenize.
char poked = 0;
// Token kinds returned by get_token()/process_symbols() and printed by
// debug_token(). Values are implicit (Unknown == 0, then increasing), and
// Unknown is also what main() uses as its stop condition.
enum {
Unknown, // unrecognised character or end of input
Id, // identifier: a letter followed by letters/digits (see get_id)
Number, // run of decimal digits (see get_num)
EOS, // ;
Comma, // ,
LP, // (
RP, // )
Begin, // {
End, // }
LS, // [
RS, // ]
Sum, // +
Dec, // -
Mul, // *
Div, // /
Mod, // %
And, // printed as "Logical And" by debug_token
Or, // printed as "Logical Or" by debug_token
Xor, // printed as "Logical Xor" by debug_token
Greater, // >
Less, // <
Eq, // ==
GE, // >=
LE, // <=
Assign // =
};
/*
 * Return the next input character.
 *
 * A pending pushed-back character in `poked` takes priority and is cleared
 * once returned. Otherwise the character at `position` in str_to_tokenize is
 * returned and `position` advances, except at the terminating 0, which is
 * returned repeatedly without moving past the end of the string.
 */
char get_char() {
    if (poked != 0) {
        char pending = poked;
        poked = 0;
        return pending;
    }

    char next = str_to_tokenize[position];
    if (next != 0) {
        position++;
    }
    return next;
}
/* Return 1 when c is an ASCII letter ('a'..'z' or 'A'..'Z'), otherwise 0. */
char is_alpha(char c) {
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    return (c >= 'A' && c <= 'Z');
}
/* Return 1 when c is an ASCII decimal digit ('0'..'9'), otherwise 0. */
char is_digit(char c) {
    if (c < '0' || c > '9') {
        return 0;
    }
    return 1;
}
/* Return 1 for space, tab, CR (13) or LF (10); 0 for anything else. */
char is_space(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case 13:
    case 10:
        return 1;
    default:
        return 0;
    }
}
/*
 * Return 1 when c is one of the punctuation/operator characters that can
 * start a symbol token, otherwise 0.
 */
char is_symbol(char c) {
    switch (c) {
    /* arithmetic and bitwise operators */
    case '+': case '-': case '*': case '/': case '%':
    case '&': case '|':
    /* comparison / assignment starters */
    case '>': case '<': case '=':
    /* brackets */
    case '(': case ')':
    case '{': case '}':
    case '[': case ']':
    /* separators */
    case ',': case ';': case ':':
        return 1;
    default:
        return 0;
    }
}
/*
 * Consume characters while is_space() accepts them, then push the first
 * non-space character (possibly the terminating 0) back into `poked` so the
 * next get_char() call returns it.
 */
void skip_spaces() {
    char ch = get_char();
    while (is_space(ch)) {
        ch = get_char();
    }
    poked = ch;
}
void get_id() {
int pos = 0;
char c;
while(1) {
c = get_char();
if (is_alpha(c) || is_digit(c)) {
current_token[pos++] = c;
} else {
current_token[pos] = 0;
poked = c;
return;
}
}
}
void get_num() {
int pos = 0;
char c;
while(1) {
c = get_char();
if (is_digit(c)) {
current_token[pos++] = c;
} else {
current_token[pos] = 0;
poked = c;
return;
}
}
}
char process_symbols() {
char c, tmp;
c = get_char();
switch (c)
{
case '/': return Div;
case '*': return Mul;
case '+': return Sum;
case '-': return Dec;
case '%': return Mod;
case '(': return LP;
case ')': return RP;
case ',': return Comma;
case ';': return EOS;
case '>':
poked = get_char();
if (poked == '=') {
poked = 0;
return GE;
}
return Greater;
case '<':
poked = get_char();
if (poked == '=') {
poked = 0;
return LE;
}
return Less;
case '=':
poked = get_char();
if (poked == '=') {
poked = 0;
return Eq;
}
return Assign;
default:
return Unknown;
}
}
/*
 * Read the next token from the input and return its kind.
 *
 * Leading whitespace and '#' comments (running to the next CR or LF) are
 * skipped. For Id and Number tokens the text is left in current_token;
 * for every other kind current_token is the empty string. Returns Unknown
 * at end of input or on a character that starts no known token.
 */
char get_token() {
    char c;

    current_token[0] = 0;

    for (;;) {
        skip_spaces();
        c = get_char();
        if (c != '#') {
            break;
        }
        /* Skip the comment body. Stopping on 0 as well is essential:
         * get_char() returns 0 forever at end of input, so a comment with
         * no trailing newline would otherwise never terminate. */
        while (c != 13 && c != 10 && c != 0) {
            c = get_char();
        }
        if (c == 0) {
            return Unknown;
        }
    }

    /* Starting from a symbol - operators and punctuation. */
    if (is_symbol(c)) {
        poked = c;
        return process_symbols();
    }

    /* Starting from a letter - identifier. */
    if (is_alpha(c)) {
        poked = c;
        get_id();
        return Id;
    }

    /* Starting from a digit - numeric literal. */
    if (is_digit(c)) {
        poked = c;
        get_num();
        return Number;
    }

    return Unknown;
}
void debug_token(char type) {
switch (type)
{
case Id:
printf("Id");
break;
case Number:
printf("Num");
break;
case EOS:
printf(";");
return;
case Comma: printf("Comma"); return;
case LP: printf("("); return;
case RP: printf(")"); return;
case Begin: printf(" Begin of block "); return;
case End: printf(" End of block "); return;
case LS: printf("[ "); return;
case RS: printf("] "); return;
case Sum: printf("add "); return;
case Dec: printf("sub "); return;
case Mul: printf("mul "); return;
case Div: printf("div "); return;
case Mod: printf(" mod "); return;
case And: printf("Logical And "); return;
case Or: printf("Logical Or "); return;
case Xor: printf("Logical Xor "); return;
case Greater: printf("Greater "); return;
case Less: printf("Less "); return;
case Eq: printf("Equals "); return;
case GE: printf("Greater Or Equal "); return;
case LE: printf("Less Or Equal "); return;
case Assign: printf("Assign "); return;
default:
printf("Unknown type: ");
break;
}
printf("(%s) ", current_token);
}
int main() {
char t;
t = get_token();
while (t != Unknown) {
debug_token(t);
t = get_token();
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment