From c257188cd912503371db1c2b7b2c59b4fd53df1c Mon Sep 17 00:00:00 2001
From: midipix <writeonce@midipix.org>
Date: Wed, 15 May 2024 20:40:33 +0000
Subject: regex module: implemented token scanners, added definitions and scan
 table.

---
 src/regex/tbnf_regex.c         | 185 ++++++++++++++++++++++++++
 src/regex/tbnf_regex_defs.h    |  64 +++++++++
 src/regex/tbnf_regex_scanfns.h | 287 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 536 insertions(+)
 create mode 100644 src/regex/tbnf_regex.c
 create mode 100644 src/regex/tbnf_regex_defs.h
 create mode 100644 src/regex/tbnf_regex_scanfns.h

(limited to 'src/regex')

diff --git a/src/regex/tbnf_regex.c b/src/regex/tbnf_regex.c
new file mode 100644
index 0000000..2cdcb48
--- /dev/null
+++ b/src/regex/tbnf_regex.c
@@ -0,0 +1,185 @@
+/**************************************************************/
+/*  treebnf: a tree oriented bnf library                      */
+/*  Copyright (C) 2024  SysDeer Technologies, LLC             */
+/*  Released under GPLv2 and GPLv3; see COPYING.TREEBNF.      */
+/**************************************************************/
+
+#include <treebnf/treebnf.h>
+
+#include "treebnf_regex_impl.h"
+#include "treebnf_visibility_impl.h"
+
+#include "tbnf_regex_defs.h"
+#include "tbnf_regex_scanfns.h"
+
+#define TBNF_STATE_STACK_SIZE   (512)  /* NOTE(review): not referenced in this file; presumably the scan state stack depth -- confirm */
+
+/* init state scan table: indexed by token type; unlisted slots stay zeroed */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__init[TBNF_REGEX_TOK_CAP] = {
+	/* --> brace: '{' pushes the brace state */
+	[TBNF_REGEX_TOK_LBRACE] = {
+		.tok_scan_fn    = tbnf_regex_scan_lbrace,
+		.tok_state_op   = TBNF_STATE_PUSH,
+		.tok_state_next = TBNF_REGEX_STATE_BRACE,
+	},
+
+
+	/* --> bracket: "[^]", "[^", "[]" and "[" all push the bracket state */
+	[TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET] = {
+		.tok_scan_fn    = tbnf_regex_scan_lbracket_circumflex_rbracket,
+		.tok_state_op   = TBNF_STATE_PUSH,
+		.tok_state_next = TBNF_REGEX_STATE_BRACKET,
+	},
+
+	[TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX] = {
+		.tok_scan_fn    = tbnf_regex_scan_lbracket_circumflex,
+		.tok_state_op   = TBNF_STATE_PUSH,
+		.tok_state_next = TBNF_REGEX_STATE_BRACKET,
+	},
+
+	[TBNF_REGEX_TOK_LBRACKET_RBRACKET] = {
+		.tok_scan_fn    = tbnf_regex_scan_lbracket_rbracket,
+		.tok_state_op   = TBNF_STATE_PUSH,
+		.tok_state_next = TBNF_REGEX_STATE_BRACKET,
+	},
+
+	[TBNF_REGEX_TOK_LBRACKET] = {
+		.tok_scan_fn    = tbnf_regex_scan_lbracket,
+		.tok_state_op   = TBNF_STATE_PUSH,
+		.tok_state_next = TBNF_REGEX_STATE_BRACKET,
+	},
+
+
+	/* (expression): every token below keeps the current state */
+	[TBNF_REGEX_TOK_ESCAPED_CHAR] = {
+		.tok_scan_fn    = tbnf_regex_scan_escaped_char,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK] = {
+		.tok_scan_fn    = tbnf_regex_scan_circumflex_asterisk,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_CIRCUMFLEX] = {
+		.tok_scan_fn    = tbnf_regex_scan_circumflex,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_LPAREN] = {
+		.tok_scan_fn    = tbnf_regex_scan_lparen,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_RPAREN] = {
+		.tok_scan_fn    = tbnf_regex_scan_rparen,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_ASTERISK] = {
+		.tok_scan_fn    = tbnf_regex_scan_asterisk,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_PERIOD] = {
+		.tok_scan_fn    = tbnf_regex_scan_period,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_DOLLAR] = {
+		.tok_scan_fn    = tbnf_regex_scan_dollar,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_VLINE] = {
+		.tok_scan_fn    = tbnf_regex_scan_vline,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_QMARK] = {
+		.tok_scan_fn    = tbnf_regex_scan_qmark,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_PLUS] = {
+		.tok_scan_fn    = tbnf_regex_scan_plus,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_CHAR] = {
+		.tok_scan_fn    = tbnf_regex_scan_char,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+};
+
+
+/* brace state scan table: '}' pops the state; a digit or ',' keeps it */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__brace[TBNF_REGEX_TOK_CAP] = {
+	[TBNF_REGEX_TOK_BRACE_RBRACE] = {
+		.tok_scan_fn    = tbnf_regex_scan_brace_rbrace,
+		.tok_state_op   = TBNF_STATE_POP,
+	},
+
+	[TBNF_REGEX_TOK_BRACE_DIGIT] = {
+		.tok_scan_fn    = tbnf_regex_scan_brace_digit,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACE_COMMA] = {
+		.tok_scan_fn    = tbnf_regex_scan_brace_comma,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+};
+
+
+/* bracket state scan table: ']' pops the state; every other token keeps it */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__bracket[TBNF_REGEX_TOK_CAP] = {
+	[TBNF_REGEX_TOK_BRACKET_RBRACKET] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_rbracket,
+		.tok_state_op   = TBNF_STATE_POP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_escaped_char,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_character_class,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_collation_symbol,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_equivalence_class,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_HYPHEN] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_hyphen,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_ERROR] = {
+		.tok_scan_fn    = tbnf_regex_scan_bracket_error,
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+
+	[TBNF_REGEX_TOK_BRACKET_CHAR] = {
+		.tok_scan_fn    = tbnf_regex_scan_char, /* same scanner as TBNF_REGEX_TOK_CHAR */
+		.tok_state_op   = TBNF_STATE_KEEP,
+	},
+};
+
+
+/* extended regex token scan tables, indexed by scan state; the last slot is null */
+tbnf_hidden struct tbnf_scan_tbl * tbnf_regex_scan_tbl[] = {
+	[TBNF_REGEX_STATE_INIT]    = tbnf_regex_scan_tbl__init,
+	[TBNF_REGEX_STATE_BRACE]   = tbnf_regex_scan_tbl__brace,
+	[TBNF_REGEX_STATE_BRACKET] = tbnf_regex_scan_tbl__bracket,
+	[TBNF_REGEX_STATE_CAP]     = 0,
+};
diff --git a/src/regex/tbnf_regex_defs.h b/src/regex/tbnf_regex_defs.h
new file mode 100644
index 0000000..9276ae0
--- /dev/null
+++ b/src/regex/tbnf_regex_defs.h
@@ -0,0 +1,64 @@
+/**************************************************************/
+/*  treebnf: a tree oriented bnf library                      */
+/*  Copyright (C) 2024  SysDeer Technologies, LLC             */
+/*  Released under GPLv2 and GPLv3; see COPYING.TREEBNF.      */
+/**************************************************************/
+
+#ifndef TBNF_REGEX_DEFS_H
+#define TBNF_REGEX_DEFS_H
+
+enum tbnf_regex_scan_state {      /* indices into tbnf_regex_scan_tbl[]           */
+	TBNF_REGEX_STATE_INIT,    /* first state (0): expression-level tokens     */
+	TBNF_REGEX_STATE_BRACE,   /* pushed by TBNF_REGEX_TOK_LBRACE              */
+	TBNF_REGEX_STATE_BRACKET, /* pushed by the TBNF_REGEX_TOK_LBRACKET* tokens */
+	TBNF_REGEX_STATE_CAP,     /* state count; slot of the null table entry    */
+};
+
+enum tbnf_regex_token_type {
+	TBNF_REGEX_TOK_NONE, /* first enumerator (0); no scan table lists it */
+
+	/* in-brace tokens (listed in the brace state scan table) */
+	TBNF_REGEX_TOK_BRACE_RBRACE,
+	TBNF_REGEX_TOK_BRACE_DIGIT,
+	TBNF_REGEX_TOK_BRACE_COMMA,
+
+	/* in-bracket tokens (listed in the bracket state scan table) */
+	TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR,
+	TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS,
+	TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL,
+	TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS,
+	TBNF_REGEX_TOK_BRACKET_RBRACKET,
+	TBNF_REGEX_TOK_BRACKET_HYPHEN,
+	TBNF_REGEX_TOK_BRACKET_ERROR,
+	TBNF_REGEX_TOK_BRACKET_CHAR,
+
+	/* brace state initializer token (pushes TBNF_REGEX_STATE_BRACE) */
+	TBNF_REGEX_TOK_LBRACE,
+
+	/* bracket state initializer tokens, longest spelling first */
+	TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET,
+	TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX,
+	TBNF_REGEX_TOK_LBRACKET_RBRACKET,
+	TBNF_REGEX_TOK_LBRACKET,
+
+	/* init state tokens (each keeps the current state) */
+	TBNF_REGEX_TOK_ESCAPED_CHAR,
+
+	TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK,
+	TBNF_REGEX_TOK_CIRCUMFLEX,
+
+	TBNF_REGEX_TOK_LPAREN,
+	TBNF_REGEX_TOK_RPAREN,
+
+	TBNF_REGEX_TOK_ASTERISK,
+	TBNF_REGEX_TOK_PERIOD,
+	TBNF_REGEX_TOK_DOLLAR,
+	TBNF_REGEX_TOK_VLINE,
+	TBNF_REGEX_TOK_QMARK,
+	TBNF_REGEX_TOK_PLUS,
+	TBNF_REGEX_TOK_CHAR,
+
+	TBNF_REGEX_TOK_CAP, /* token type count; dimension of each scan table */
+};
+
+#endif
diff --git a/src/regex/tbnf_regex_scanfns.h b/src/regex/tbnf_regex_scanfns.h
new file mode 100644
index 0000000..c57a606
--- /dev/null
+++ b/src/regex/tbnf_regex_scanfns.h
@@ -0,0 +1,287 @@
+/**************************************************************/
+/*  treebnf: a tree oriented bnf library                      */
+/*  Copyright (C) 2024  SysDeer Technologies, LLC             */
+/*  Released under GPLv2 and GPLv3; see COPYING.TREEBNF.      */
+/**************************************************************/
+
+#ifndef TBNF_REGEX_SCANFNS_H
+#define TBNF_REGEX_SCANFNS_H
+
+#include <treebnf/treebnf.h>
+
+/* in-brace scanners */
+/* in-brace: matches a '}' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_brace_rbrace(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '}');
+
+	return matched ? 1 : -1;
+}
+
+/* in-brace: matches one decimal digit ('0'..'9') at the scan mark; */
+/* returns 1 on a match, else -1                                    */
+static inline int tbnf_regex_scan_brace_digit(const struct tbnf_scan_ctx * sctx)
+{
+	const int ch = sctx->tok_scan_mark[0];
+
+	return ((ch >= '0') && (ch <= '9')) ? 1 : -1;
+}
+
+/* in-brace: matches a ',' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_brace_comma(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ',');
+
+	return matched ? 1 : -1;
+}
+
+
+/* in-bracket scanners */
+/* in-bracket: a backslash, then a non-null char below tok_scan_cap; returns 2 or -1 */
+static inline int tbnf_regex_scan_bracket_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '\\')
+		return -1;
+	if (&sctx->tok_scan_mark[1] >= sctx->tok_scan_cap)
+		return -1;
+	return sctx->tok_scan_mark[1] ? 2 : -1;
+}
+
+/* in-bracket: matches "[:name:]", where name is a (possibly empty) run of */
+/* lowercase ascii letters; returns the length of the whole expression, or */
+/* -1 on no match or when fewer than five chars lie below tok_scan_cap.    */
+static inline int tbnf_regex_scan_bracket_character_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * mark = sctx->tok_scan_mark;
+	const char * cap  = sctx->tok_scan_cap;
+	const char * ch;
+
+	if (&mark[4] >= cap)
+		return -1;
+
+	if ((mark[0] != '[') || (mark[1] != ':'))
+		return -1;
+
+	/* skip the name; the cap test keeps ch[1] below tok_scan_cap afterwards */
+	ch = &mark[2];
+	while ((*ch >= 'a') && (*ch <= 'z') && (&ch[2] < cap))
+		ch++;
+
+	return ((ch[0] == ':') && (ch[1] == ']')) ? (int)(&ch[2] - mark) : -1;
+}
+
+/* in-bracket: matches "[.c.]", where c is one non-null char; returns 5, or */
+/* -1 on no match or when fewer than five chars lie below tok_scan_cap.     */
+static inline int tbnf_regex_scan_bracket_collation_symbol(const struct tbnf_scan_ctx * sctx)
+{
+	const char * mark = sctx->tok_scan_mark;
+
+	if (&mark[4] >= sctx->tok_scan_cap)
+		return -1;
+
+	if ((mark[0] != '[') || (mark[1] != '.') || (mark[3] != '.'))
+		return -1;
+
+	return ((mark[4] == ']') && mark[2]) ? 5 : -1;
+}
+
+/* in-bracket: matches "[=name=]", where name is a (possibly empty) run of */
+/* lowercase ascii letters; returns the length of the whole expression, or */
+/* -1 on no match or when fewer than five chars lie below tok_scan_cap.    */
+static inline int tbnf_regex_scan_bracket_equivalence_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * mark = sctx->tok_scan_mark;
+	const char * cap  = sctx->tok_scan_cap;
+	const char * ch;
+
+	if (&mark[4] >= cap)
+		return -1;
+
+	if ((mark[0] != '[') || (mark[1] != '='))
+		return -1;
+
+	/* skip the name; the cap test keeps ch[1] below tok_scan_cap afterwards */
+	ch = &mark[2];
+	while ((*ch >= 'a') && (*ch <= 'z') && (&ch[2] < cap))
+		ch++;
+
+	return ((ch[0] == '=') && (ch[1] == ']')) ? (int)(&ch[2] - mark) : -1;
+}
+
+/* in-bracket: matches a ']' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_bracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ']');
+
+	return matched ? 1 : -1;
+}
+
+/* in-bracket: matches a '-' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_bracket_hyphen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '-');
+
+	return matched ? 1 : -1;
+}
+
+/* in-bracket: matches a '[' at the scan mark; returns 1 on a match, else -1. */
+/* NOTE(review): presumably a '[' that no class scanner accepted -- confirm.  */
+static inline int tbnf_regex_scan_bracket_error(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '[');
+	return matched ? 1 : -1;
+}
+
+
+
+/* bracket state entry scanners */
+/* bracket entry: matches "[^]" when its third character lies below */
+/* tok_scan_cap; returns 3 on a match, else -1                      */
+static inline int tbnf_regex_scan_lbracket_circumflex_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+	if (&sctx->tok_scan_mark[2] >= sctx->tok_scan_cap)
+		return -1;
+	return ((sctx->tok_scan_mark[1] == '^') && (sctx->tok_scan_mark[2] == ']')) ? 3 : -1;
+}
+
+/* bracket entry: matches "[^" with the '^' below tok_scan_cap; returns 2 or -1 */
+static inline int tbnf_regex_scan_lbracket_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+	if (&sctx->tok_scan_mark[1] >= sctx->tok_scan_cap)
+		return -1;
+	return (sctx->tok_scan_mark[1] == '^') ? 2 : -1;
+}
+
+/* bracket entry: matches "[]" with the ']' below tok_scan_cap; returns 2 or -1 */
+static inline int tbnf_regex_scan_lbracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+	if (&sctx->tok_scan_mark[1] >= sctx->tok_scan_cap)
+		return -1;
+	return (sctx->tok_scan_mark[1] == ']') ? 2 : -1;
+}
+
+/* bracket entry: matches a '[' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_lbracket(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '[');
+
+	return matched ? 1 : -1;
+}
+
+
+/* common scanners */
+/* matches a backslash, then a non-null char below tok_scan_cap; returns 2 or -1 */
+static inline int tbnf_regex_scan_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '\\')
+		return -1;
+	if (&sctx->tok_scan_mark[1] >= sctx->tok_scan_cap)
+		return -1;
+	return sctx->tok_scan_mark[1] ? 2 : -1;
+}
+
+/* matches "^*" with the '*' below tok_scan_cap; returns 2 on a match, else -1 */
+static inline int tbnf_regex_scan_circumflex_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '^')
+		return -1;
+	if (&sctx->tok_scan_mark[1] >= sctx->tok_scan_cap)
+		return -1;
+	return (sctx->tok_scan_mark[1] == '*') ? 2 : -1;
+}
+
+/* matches a '^' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '^');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '{' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_lbrace(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '{');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '(' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_lparen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '(');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a ')' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_rparen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ')');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '*' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '*');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '.' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_period(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '.');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '$' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_dollar(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '$');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '|' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_vline(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '|');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '?' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_qmark(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '?');
+
+	return matched ? 1 : -1;
+}
+
+/* matches a '+' at the scan mark; returns 1 on a match, else -1 */
+static inline int tbnf_regex_scan_plus(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '+');
+
+	return matched ? 1 : -1;
+}
+
+/* matches any non-null character at the scan mark; returns 1, else -1 */
+static inline int tbnf_regex_scan_char(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] != 0);
+
+	return matched ? 1 : -1;
+}
+
+#endif
-- 
cgit v1.2.3