summary refs log tree commit diff homepage
path: root/src/regex
diff options
context:
space:
mode:
author midipix <writeonce@midipix.org> 2024-05-15 20:40:33 +0000
committer midipix <writeonce@midipix.org> 2024-05-15 20:40:33 +0000
commit c257188cd912503371db1c2b7b2c59b4fd53df1c (patch)
tree 8077d9f09fa85521f53a94f3fa33d17a7975a11c /src/regex
parent ae7810f56e1daa1d2e35c06969c26835c1ed7800 (diff)
download treebnf-c257188cd912503371db1c2b7b2c59b4fd53df1c.tar.bz2
treebnf-c257188cd912503371db1c2b7b2c59b4fd53df1c.tar.xz
regex module: implemented token scanners, added definitions and scan table.
Diffstat (limited to 'src/regex')
-rw-r--r-- src/regex/tbnf_regex.c 185
-rw-r--r-- src/regex/tbnf_regex_defs.h 64
-rw-r--r-- src/regex/tbnf_regex_scanfns.h 287
3 files changed, 536 insertions, 0 deletions
diff --git a/src/regex/tbnf_regex.c b/src/regex/tbnf_regex.c
new file mode 100644
index 0000000..2cdcb48
--- /dev/null
+++ b/src/regex/tbnf_regex.c
@@ -0,0 +1,185 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF. */
+/**************************************************************/
+
+#include <treebnf/treebnf.h>
+
+#include "treebnf_regex_impl.h"
+#include "treebnf_visibility_impl.h"
+
+#include "tbnf_regex_defs.h"
+#include "tbnf_regex_scanfns.h"
+
+#define TBNF_STATE_STACK_SIZE (512) /* NOTE(review): not referenced anywhere in this file; presumably the depth of the scan state stack -- confirm */
+
+/* init state scan table, indexed by token type: each entry pairs a scanner function with a state operation */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__init[TBNF_REGEX_TOK_CAP] = {
+ /* --> brace: the opening brace pushes the brace state */
+ [TBNF_REGEX_TOK_LBRACE] = {
+ .tok_scan_fn = tbnf_regex_scan_lbrace,
+ .tok_state_op = TBNF_STATE_PUSH,
+ .tok_state_next = TBNF_REGEX_STATE_BRACE,
+ },
+
+
+ /* --> bracket: each of the four bracket openers pushes the bracket state */
+ [TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET] = {
+ .tok_scan_fn = tbnf_regex_scan_lbracket_circumflex_rbracket,
+ .tok_state_op = TBNF_STATE_PUSH,
+ .tok_state_next = TBNF_REGEX_STATE_BRACKET,
+ },
+
+ [TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX] = {
+ .tok_scan_fn = tbnf_regex_scan_lbracket_circumflex,
+ .tok_state_op = TBNF_STATE_PUSH,
+ .tok_state_next = TBNF_REGEX_STATE_BRACKET,
+ },
+
+ [TBNF_REGEX_TOK_LBRACKET_RBRACKET] = {
+ .tok_scan_fn = tbnf_regex_scan_lbracket_rbracket,
+ .tok_state_op = TBNF_STATE_PUSH,
+ .tok_state_next = TBNF_REGEX_STATE_BRACKET,
+ },
+
+ [TBNF_REGEX_TOK_LBRACKET] = {
+ .tok_scan_fn = tbnf_regex_scan_lbracket,
+ .tok_state_op = TBNF_STATE_PUSH,
+ .tok_state_next = TBNF_REGEX_STATE_BRACKET,
+ },
+
+
+ /* (expression): each of these tokens keeps the current state */
+ [TBNF_REGEX_TOK_ESCAPED_CHAR] = {
+ .tok_scan_fn = tbnf_regex_scan_escaped_char,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK] = {
+ .tok_scan_fn = tbnf_regex_scan_circumflex_asterisk,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_CIRCUMFLEX] = {
+ .tok_scan_fn = tbnf_regex_scan_circumflex,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_LPAREN] = {
+ .tok_scan_fn = tbnf_regex_scan_lparen,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_RPAREN] = {
+ .tok_scan_fn = tbnf_regex_scan_rparen,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_ASTERISK] = {
+ .tok_scan_fn = tbnf_regex_scan_asterisk,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_PERIOD] = {
+ .tok_scan_fn = tbnf_regex_scan_period,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_DOLLAR] = {
+ .tok_scan_fn = tbnf_regex_scan_dollar,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_VLINE] = {
+ .tok_scan_fn = tbnf_regex_scan_vline,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_QMARK] = {
+ .tok_scan_fn = tbnf_regex_scan_qmark,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_PLUS] = {
+ .tok_scan_fn = tbnf_regex_scan_plus,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_CHAR] = {
+ .tok_scan_fn = tbnf_regex_scan_char,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+};
+
+
+/* brace state scan table: the closing brace pops the state, digits and commas keep it */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__brace[TBNF_REGEX_TOK_CAP] = {
+ [TBNF_REGEX_TOK_BRACE_RBRACE] = {
+ .tok_scan_fn = tbnf_regex_scan_brace_rbrace,
+ .tok_state_op = TBNF_STATE_POP,
+ },
+
+ [TBNF_REGEX_TOK_BRACE_DIGIT] = {
+ .tok_scan_fn = tbnf_regex_scan_brace_digit,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACE_COMMA] = {
+ .tok_scan_fn = tbnf_regex_scan_brace_comma,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+};
+
+
+/* bracket state scan table: the closing bracket pops the state, every other token keeps it */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__bracket[TBNF_REGEX_TOK_CAP] = {
+ [TBNF_REGEX_TOK_BRACKET_RBRACKET] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_rbracket,
+ .tok_state_op = TBNF_STATE_POP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_escaped_char,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_character_class,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_collation_symbol,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_equivalence_class,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_HYPHEN] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_hyphen,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_ERROR] = {
+ .tok_scan_fn = tbnf_regex_scan_bracket_error,
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+
+ [TBNF_REGEX_TOK_BRACKET_CHAR] = {
+ .tok_scan_fn = tbnf_regex_scan_char, /* same scanner as TBNF_REGEX_TOK_CHAR in the init table */
+ .tok_state_op = TBNF_STATE_KEEP,
+ },
+};
+
+
+/* extended regex expression token scan table: maps each scan state to its per-token scan table */
+tbnf_hidden struct tbnf_scan_tbl * tbnf_regex_scan_tbl[] = {
+ [TBNF_REGEX_STATE_INIT] = tbnf_regex_scan_tbl__init,
+ [TBNF_REGEX_STATE_BRACE] = tbnf_regex_scan_tbl__brace,
+ [TBNF_REGEX_STATE_BRACKET] = tbnf_regex_scan_tbl__bracket,
+ [TBNF_REGEX_STATE_CAP] = 0, /* null slot at the cap index */
+};
diff --git a/src/regex/tbnf_regex_defs.h b/src/regex/tbnf_regex_defs.h
new file mode 100644
index 0000000..9276ae0
--- /dev/null
+++ b/src/regex/tbnf_regex_defs.h
@@ -0,0 +1,64 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF. */
+/**************************************************************/
+
+#ifndef TBNF_REGEX_DEFS_H
+#define TBNF_REGEX_DEFS_H
+
+enum tbnf_regex_scan_state { /* scanner states; each one indexes tbnf_regex_scan_tbl */
+ TBNF_REGEX_STATE_INIT, /* selects tbnf_regex_scan_tbl__init */
+ TBNF_REGEX_STATE_BRACE, /* pushed by TBNF_REGEX_TOK_LBRACE */
+ TBNF_REGEX_STATE_BRACKET, /* pushed by the four TBNF_REGEX_TOK_LBRACKET* tokens */
+ TBNF_REGEX_STATE_CAP, /* number of states; its slot in tbnf_regex_scan_tbl is null */
+};
+
+enum tbnf_regex_token_type { /* token types; each one indexes the per-state scan tables */
+ TBNF_REGEX_TOK_NONE, /* zero value; no scan table registers a scanner for it */
+
+ /* in-brace tokens (entries of the brace state scan table) */
+ TBNF_REGEX_TOK_BRACE_RBRACE,
+ TBNF_REGEX_TOK_BRACE_DIGIT,
+ TBNF_REGEX_TOK_BRACE_COMMA,
+
+ /* in-bracket tokens (entries of the bracket state scan table) */
+ TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR,
+ TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS,
+ TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL,
+ TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS,
+ TBNF_REGEX_TOK_BRACKET_RBRACKET,
+ TBNF_REGEX_TOK_BRACKET_HYPHEN,
+ TBNF_REGEX_TOK_BRACKET_ERROR,
+ TBNF_REGEX_TOK_BRACKET_CHAR,
+
+ /* brace state initializer token */
+ TBNF_REGEX_TOK_LBRACE,
+
+ /* bracket state initializer tokens, longest spelling first (presumably tokens are tried in enum order -- TODO confirm) */
+ TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET,
+ TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX,
+ TBNF_REGEX_TOK_LBRACKET_RBRACKET,
+ TBNF_REGEX_TOK_LBRACKET,
+
+ /* init state tokens that keep the current state */
+ TBNF_REGEX_TOK_ESCAPED_CHAR,
+
+ TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK,
+ TBNF_REGEX_TOK_CIRCUMFLEX,
+
+ TBNF_REGEX_TOK_LPAREN,
+ TBNF_REGEX_TOK_RPAREN,
+
+ TBNF_REGEX_TOK_ASTERISK,
+ TBNF_REGEX_TOK_PERIOD,
+ TBNF_REGEX_TOK_DOLLAR,
+ TBNF_REGEX_TOK_VLINE,
+ TBNF_REGEX_TOK_QMARK,
+ TBNF_REGEX_TOK_PLUS,
+ TBNF_REGEX_TOK_CHAR,
+
+ TBNF_REGEX_TOK_CAP, /* number of token types; the length of each per-state scan table */
+};
+
+#endif
diff --git a/src/regex/tbnf_regex_scanfns.h b/src/regex/tbnf_regex_scanfns.h
new file mode 100644
index 0000000..c57a606
--- /dev/null
+++ b/src/regex/tbnf_regex_scanfns.h
@@ -0,0 +1,287 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF. */
+/**************************************************************/
+
+#ifndef TBNF_REGEX_SCANFNS_H
+#define TBNF_REGEX_SCANFNS_H
+
+#include <treebnf/treebnf.h>
+
+/* in-brace scanners */
+/* tests the character at the scan mark against '}': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_brace_rbrace(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '}');
+	return matched ? 1 : -1;
+}
+
+/* tests whether the character at the scan mark is a decimal digit */
+/* ('0' through '9'): yields 1 when it is, and -1 when it is not.  */
+static inline int tbnf_regex_scan_brace_digit(const struct tbnf_scan_ctx * sctx)
+{
+	const int is_digit = (sctx->tok_scan_mark[0] >= '0')
+		&& (sctx->tok_scan_mark[0] <= '9');
+	return is_digit ? 1 : -1;
+}
+
+/* tests the character at the scan mark against ',': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_brace_comma(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ',');
+	return matched ? 1 : -1;
+}
+
+
+/* in-bracket scanners */
+/* matches a backslash followed by a non-null character that lies */
+/* before the scan cap: yields 2 on a match, and -1 otherwise.     */
+static inline int tbnf_regex_scan_bracket_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '\\')
+		return -1;
+
+	return ((&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) && sctx->tok_scan_mark[1]) ? 2 : -1;
+}
+
+/* matches "[:", a run of lowercase letters, and ":]" at the scan mark; yields the */
+/* matched length, or -1. lengths are tested as distances to the scan cap, since   */
+/* forming a pointer (&mark[4]) beyond the end of the scanned buffer is undefined.  */
+/* NOTE(review): the run may be empty, so "[::]" is accepted -- confirm the intent. */
+static inline int tbnf_regex_scan_bracket_character_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * ch;
+
+	if ((sctx->tok_scan_cap - sctx->tok_scan_mark) <= 4)
+		return -1;
+
+	if ((sctx->tok_scan_mark[0] != '[') || (sctx->tok_scan_mark[1] != ':'))
+		return -1;
+
+	for (ch = &sctx->tok_scan_mark[2]; (*ch >= 'a') && (*ch <= 'z') && ((sctx->tok_scan_cap - ch) > 2); )
+		ch++;
+
+	if ((ch[0] == ':') && (ch[1] == ']'))
+		return (int)(&ch[2] - sctx->tok_scan_mark);
+
+	return -1;
+}
+
+/* matches "[.c.]" at the scan mark, where c is any single non-null character; */
+/* yields 5 on a match, and -1 otherwise. the length test compares distances   */
+/* so that no pointer beyond the end of the scanned buffer is ever formed.     */
+static inline int tbnf_regex_scan_bracket_collation_symbol(const struct tbnf_scan_ctx * sctx)
+{
+	const char * mark = sctx->tok_scan_mark;
+
+	if ((sctx->tok_scan_cap - mark) <= 4)
+		return -1;
+
+	if ((mark[0] == '[') && (mark[1] == '.') && (mark[3] == '.') && (mark[4] == ']') && mark[2])
+		return 5;
+	return -1;
+}
+
+/* matches "[=", a run of lowercase letters, and "=]" at the scan mark; yields the */
+/* matched length, or -1. lengths are tested as distances to the scan cap, since   */
+/* forming a pointer (&mark[4]) beyond the end of the scanned buffer is undefined.  */
+/* NOTE(review): only 'a'..'z' may appear between the delimiters, so "[=A=]" fails. */
+static inline int tbnf_regex_scan_bracket_equivalence_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * ch;
+
+	if ((sctx->tok_scan_cap - sctx->tok_scan_mark) <= 4)
+		return -1;
+
+	if ((sctx->tok_scan_mark[0] != '[') || (sctx->tok_scan_mark[1] != '='))
+		return -1;
+
+	for (ch = &sctx->tok_scan_mark[2]; (*ch >= 'a') && (*ch <= 'z') && ((sctx->tok_scan_cap - ch) > 2); )
+		ch++;
+
+	if ((ch[0] == '=') && (ch[1] == ']'))
+		return (int)(&ch[2] - sctx->tok_scan_mark);
+
+	return -1;
+}
+
+/* tests the character at the scan mark against ']': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_bracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ']');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '-': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_bracket_hyphen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '-');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '[': yields 1 on a match, or -1. */
+/* NOTE(review): presumably a '[' that no class scanner accepted -- confirm.      */
+static inline int tbnf_regex_scan_bracket_error(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '[');
+	return matched ? 1 : -1;
+}
+
+
+
+/* bracket state entry scanners */
+/* matches the three characters "[^]" at the scan mark: yields 3, or else -1.  */
+/* the length test is a distance, so &mark[2] is never formed past the buffer. */
+static inline int tbnf_regex_scan_lbracket_circumflex_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if ((sctx->tok_scan_mark[0] == '[') && ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 2)
+			&& (sctx->tok_scan_mark[1] == '^') && (sctx->tok_scan_mark[2] == ']'))
+		return 3;
+
+	return -1;
+}
+
+/* matches '[' followed by '^', the latter required to lie before */
+/* the scan cap: yields 2 on a match, and -1 otherwise.            */
+static inline int tbnf_regex_scan_lbracket_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+
+	return ((&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) && (sctx->tok_scan_mark[1] == '^')) ? 2 : -1;
+}
+
+/* matches '[' followed by ']', the latter required to lie before */
+/* the scan cap: yields 2 on a match, and -1 otherwise.            */
+static inline int tbnf_regex_scan_lbracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+
+	return ((&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) && (sctx->tok_scan_mark[1] == ']')) ? 2 : -1;
+}
+
+/* tests the character at the scan mark against '[': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_lbracket(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '[');
+	return matched ? 1 : -1;
+}
+
+
+/* common scanners */
+/* matches a backslash followed by a non-null character that lies */
+/* before the scan cap: yields 2 on a match, and -1 otherwise.     */
+static inline int tbnf_regex_scan_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '\\')
+		return -1;
+
+	return ((&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) && sctx->tok_scan_mark[1]) ? 2 : -1;
+}
+
+/* matches '^' followed by '*', the latter required to lie before */
+/* the scan cap: yields 2 on a match, and -1 otherwise.            */
+static inline int tbnf_regex_scan_circumflex_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] != '^')
+		return -1;
+
+	return ((&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) && (sctx->tok_scan_mark[1] == '*')) ? 2 : -1;
+}
+
+/* tests the character at the scan mark against '^': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '^');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '{': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_lbrace(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '{');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '(': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_lparen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '(');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against ')': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_rparen(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == ')');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '*': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '*');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '.': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_period(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '.');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '$': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_dollar(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '$');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '|': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_vline(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '|');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '?': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_qmark(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '?');
+	return matched ? 1 : -1;
+}
+
+/* tests the character at the scan mark against '+': */
+/* yields 1 when it matches, and -1 when it does not. */
+static inline int tbnf_regex_scan_plus(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] == '+');
+	return matched ? 1 : -1;
+}
+
+/* accepts any non-null character at the scan mark: */
+/* yields 1 for such a character, and -1 for a null. */
+static inline int tbnf_regex_scan_char(const struct tbnf_scan_ctx * sctx)
+{
+	const int matched = (sctx->tok_scan_mark[0] != 0);
+	return matched ? 1 : -1;
+}
+
+#endif