What's a GScanner and how do I use one?

A GScanner will tokenize your text, that is, it'll return an integer for every word or number that appears in its input stream, following certain (customizable) rules to perform this translation. You still need to write the parsing functions on your own though.

Here's a little test program supplied by Tim Janik that will parse

<SYMBOL> = <OPTIONAL-MINUS> <NUMBER> ;

constructs, while skipping "#\n" and "/**/" style comments.

#include <glib.h>

/* some test text to be fed into the scanner */
static const gchar *test_text =
( "ping = 5;\n"
  "/* slide in some \n"
  " * comments, just for the\n"
  " * fun of it \n"
  " */\n"
  "pong = -6; \n"
  "\n"
  "# the next value is a float\n"
  "zonk = 0.7;\n"
  "# redefine ping\n"
  "ping = - 0.5;\n" );

/* define enumeration values to be returned for specific symbols */
enum {
  SYMBOL_PING = G_TOKEN_LAST + 1,
  SYMBOL_PONG = G_TOKEN_LAST + 2,
  SYMBOL_ZONK = G_TOKEN_LAST + 3
};

/* symbol array */
static const struct {
  gchar *symbol_name;
  guint  symbol_token;
} symbols[] = {
  { "ping", SYMBOL_PING, },
  { "pong", SYMBOL_PONG, },
  { "zonk", SYMBOL_ZONK, },
  { NULL, 0, },
}, *symbol_p = symbols;

static gfloat ping = 0;
static gfloat pong = 0;
static gfloat zonk = 0;

static guint
parse_symbol (GScanner *scanner)
{
  guint symbol;
  gboolean negate = FALSE;

  /* expect a valid symbol */
  g_scanner_get_next_token (scanner);
  symbol = scanner->token;
  if (symbol < SYMBOL_PING ||
      symbol > SYMBOL_ZONK)
    return G_TOKEN_SYMBOL;

  /* expect '=' */
  g_scanner_get_next_token (scanner);
  if (scanner->token != '=')
    return '=';

  /* feature optional '-' */
  g_scanner_peek_next_token (scanner);
  if (scanner->next_token == '-')
    {
      g_scanner_get_next_token (scanner);
      negate = !negate;
    }

  /* expect a float (ints are converted to floats on the fly) */
  g_scanner_get_next_token (scanner);
  if (scanner->token != G_TOKEN_FLOAT)
    return G_TOKEN_FLOAT;

  /* make sure the next token is a ';' */
  if (g_scanner_peek_next_token (scanner) != ';')
    {
      /* not so, eat up the non-semicolon and error out */
      g_scanner_get_next_token (scanner);
      return ';';
    }

  /* assign value, eat the semicolon and exit successfully */
  switch (symbol)
    {
    case SYMBOL_PING:
      ping = negate ? - scanner->value.v_float : scanner->value.v_float;
      break;
    case SYMBOL_PONG:
      pong = negate ? - scanner->value.v_float : scanner->value.v_float;
      break;
    case SYMBOL_ZONK:
      zonk = negate ? - scanner->value.v_float : scanner->value.v_float;
      break;
    }
  g_scanner_get_next_token (scanner);

  return G_TOKEN_NONE;
}

int
main (int argc, char *argv[])
{
  GScanner *scanner;
  guint expected_token;

  scanner = g_scanner_new (NULL);

  /* adjust lexing behaviour to suit our needs
   */
  /* convert non-floats (octal values, hex values...) to G_TOKEN_INT */
  scanner->config->numbers_2_int = TRUE;
  /* convert G_TOKEN_INT to G_TOKEN_FLOAT */
  scanner->config->int_2_float = TRUE;
  /* don't return G_TOKEN_SYMBOL, but the symbol's value */
  scanner->config->symbol_2_token = TRUE;

  /* load symbols into the scanner */
  while (symbol_p->symbol_name)
    {
      g_scanner_add_symbol (scanner,
                            symbol_p->symbol_name,
                            GINT_TO_POINTER (symbol_p->symbol_token));
      symbol_p++;
    }

  /* feed in the text */
  g_scanner_input_text (scanner, test_text, strlen (test_text));

  /* give the error handler an idea on how the input is named */
  scanner->input_name = "test text";

  /* scanning loop, we parse the input until its end is reached,
   * the scanner encountered a lexing error, or our sub routine came
   * across invalid syntax
   */
  do
    {
      expected_token = parse_symbol (scanner);
      
      g_scanner_peek_next_token (scanner);
    }
  while (expected_token == G_TOKEN_NONE &&
         scanner->next_token != G_TOKEN_EOF &&
         scanner->next_token != G_TOKEN_ERROR);

  /* give an error message upon syntax errors */
  if (expected_token != G_TOKEN_NONE)
    g_scanner_unexp_token (scanner, expected_token, NULL, "symbol", NULL, NULL, TRUE);

  /* finsish parsing */
  g_scanner_destroy (scanner);

  /* print results */
  g_print ("ping: %f\n", ping);
  g_print ("pong: %f\n", pong);
  g_print ("zonk: %f\n", zonk);
  
  return 0;
}

You need to understand that the scanner will parse its input and tokenize it, it is up to you to interpret these tokens, not define their types before they get parsed, e.g. watch gscanner parse a string:

"hi i am 17"
| | | |
| | | v
| | v TOKEN_INT, value: 17
| v TOKEN_IDENTIFIER, value: "am"
v TOKEN_CHAR, value: 'i'
TOKEN_IDENTIFIER, value: "hi"

If you configure the scanner with:

scanner->config->int_2_float = TRUE;
scanner->config->char_2_token = TRUE;
scanner->config->scan_symbols = TRUE;

and add "am" as a symbol with

g_scanner_add_symbol (scanner, "am", "symbol value");

GScanner will parse it as

"hi i am 17"
| | | |
| | | v
| | v TOKEN_FLOAT, value: 17.0 (automatic int->float conversion)
| | TOKEN_SYMBOL, value: "symbol value" (a successfull hash table lookup
| | turned a TOKEN_IDENTIFIER into a
| | TOKEN_SYMBOL and took over the
| v symbol's value)
v 'i' ('i' can be a valid token as well, as all chars >0 and <256)
TOKEN_IDENTIFIER, value: "hi"

You need to match the token sequence with your code, and if you encounter something that you don't want, you error out:

/* expect an identifier ("hi") */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_IDENTIFIER)
  return G_TOKEN_IDENTIFIER;
/* expect a token 'i' */
g_scanner_get_next_token (scanner);
if (scanner->token != 'i')
  return 'i';
/* expect a symbol ("am") */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_SYMBOL)
  return G_TOKEN_SYMBOL;
/* expect a float (17.0) */
g_scanner_get_next_token (scanner);
if (scanner->token != G_TOKEN_FLOAT)
  return G_TOKEN_FLOAT;

If you got past here, you have parsed "hi i am 17" and would have accepted "dooh i am 42" and "bah i am 0.75" as well, but you would have not accepted "hi 7 am 17" or "hi i hi 17".