acwj 01 Lexical Scanner

2025-11-05

/posts/acwj-1/ map[location:China name:Melaton]

Table of Contents

分词器 Lexer

对于编译器来说，想要识别特定的语法和逻辑，就需要用到把代码分成有意义的每一小节，这一小节被称为 token，而识别 token 的编译器部分被称为 lexical analyzer 或 lexer

Lexer 是编译器编译的第一步

本文将从零构建一个能识别 + + * / 的扫描器

# Token

在这里，token被定义为

struct token {
  int token;      // what type of token this is
  int intvalue;   // value, if it’s an integer literal
};

而 int token 则为

enum {
  T_PLUS, T_MINUS, T_STAR, T_SLASH, T_INTLIT
};

# 核心 scan()

scan() 是这个示例的核心，我们将一步步构建

## next()

这个函数的作用是从文件顺序读取下一个字符，有两个核心变量int Putback 和 Line

在现在的编译器中，Lexer需要一个一个字读字符，但有时候会多读一个。多读的这一个不能丢弃，因为它是下一个token的开头，放回去后我们在下次就可以读到它

int Putback 是一个单字符缓冲区，用于把多读出来的字符放回去

int Line 是行数

int Putback;
int Line;
FILE *Infile;

static int next(void) {
    int c;

    // If there's value in Putback, use it
    if (Putback) {
        c = Putback;
        Putback = 0;
        return c;
    }

    // Otherwise, read next character
    c = fgetc(Infile);

    // Add Line number
    if ('\n' == c) {
        Line++;
    }

    return c;
}

## 跳过空格

空格不影响数学计算，遂跳过

static int skip(void) {
  int c = next();
  while (isspace(c))
    c = next();
  return c;
}

## 扫描数字

数字有可能不是一位数，因此我们要读取完整数字

// Return subscript
// Find position of c in s
static int chrpos(char *s, int c) {
    char *p = strchr(s, c);
    return (p ? p - s : -1);
}

// Turn string into decimals
static int scanint(int c) {
    int k, val = 0;

    while ((k = chrpos("0123456789", c)) >= 0) {
        val = val * 10 + k;
        c = next();
    }

    putback(c);
    return val;
}

## scan()

scan函数用于读取字符，填充 token struct

int scan(struct token *t) {
    int c = skip();

    switch (c) {
        case EOF: return 0;
        case '+': t->token = T_PLUS; break;
        case '-': t->token = T_MINUS; break;
        case '*': t->token = T_STAR; break;
        case '/': t->token = T_SLASH; break;
        default:
            if (isdigit(c)) {
                t->token = T_INTLIT;
                t->intvalue = scanint(c);
                break;
            }

            printf("Unrecognized token %c on Line %d", c, Line);
            exit(1);
    }

    return 1;
}

样例输入

12+1*2

样例输出

Token intlit, value 12
Token +
Token intlit, value 1
Token *
Token intlit, value 2

# 完整代码

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int Putback;
int Line;
FILE *Infile;

struct token {
    int token;      // what type of token this is
    int intvalue;   // value, if it’s an integer literal
};

enum {
    T_PLUS, T_MINUS, T_STAR, T_SLASH, T_INTLIT
  };

static int next(void) {
    int c;

    // If there's value in Putback, use it
    if (Putback) {
        c = Putback;
        Putback = 0;
        return c;
    }

    // Otherwise, read next character
    c = fgetc(Infile);

    // Add Line number
    if ('\n' == c)
        Line++;

    return c;
}

static int skip(void) {
    int c = next();
    while (isspace(c))
        c = next();
    return c;
}

static void putback(int c) {
    Putback = c;
}

static int chrpos(char *s, int c) {
    char *p = strchr(s, c);
    return (p ? p - s : -1);
}

static int scanint(int c) {
    int k, val = 0;

    while ((k = chrpos("0123456789", c)) >= 0) {
        val = val * 10 + k;
        c = next();
    }

    putback(c);
    return val;
}

int scan(struct token *t) {
    int c = skip();

    switch (c) {
        case EOF: return 0;
        case '+': t->token = T_PLUS; break;
        case '-': t->token = T_MINUS; break;
        case '*': t->token = T_STAR; break;
        case '/': t->token = T_SLASH; break;
        default:
            if (isdigit(c)) {
                t->token = T_INTLIT;
                t->intvalue = scanint(c);
                break;
            }

            printf("Unrecognized token %c on Line %d", c, Line);
            exit(1);
    }

    return 1;
}

static void scanfile() {
    struct token T;
    char *tokstr[] = { "+", "-", "*", "/", "intlit" };

    while (scan(&T)) {
        printf("Token %s", tokstr[T.token]);
        if (T.token == T_INTLIT)
            printf(", value %d", T.intvalue);
        printf("\n");
    }
}

int main(int argc, char *argv[]) {
    Line = 1;
    Putback = '\n';
    Infile = fopen(argv[1], "r");
    scanfile();
    return 0;
}