[ovs-dev] [PATCH ovn 3/3] lex: New lexical analyzer module for use in OVN.

Ben Pfaff blp at nicira.com
Thu Feb 26 05:13:47 UTC 2015


I'm determined not to let the terrible style of pseudo-parsing we have in
OVS leak into OVN.  Here's the first step.
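
As a sketch of how the new interface is meant to be driven (not part of the
patch itself; the ds_* helpers are the existing ones from
lib/dynamic-string.h), a caller tokenizes a string roughly like this:

    static void
    print_tokens(const char *input)
    {
        struct lexer lexer;

        lexer_init(&lexer, input);
        while (lexer_get(&lexer) != LEX_T_END) {
            struct ds s = DS_EMPTY_INITIALIZER;

            lex_token_format(&lexer.token, &s);
            puts(ds_cstr(&s));
            ds_destroy(&s);
        }
        lexer_destroy(&lexer);
    }

tests/test-ovn.c below does essentially this, plus a check that every
formatted token parses back losslessly.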

Signed-off-by: Ben Pfaff <blp at nicira.com>
---
 ovn/TODO           |   5 -
 ovn/automake.mk    |   3 +
 ovn/lex.c          | 688 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 ovn/lex.h          | 106 +++++++++
 ovn/ovn.xml        |  41 ++--
 tests/automake.mk  |   6 +-
 tests/ovn.at       |  97 ++++++++
 tests/test-ovn.c   | 112 +++++++++
 tests/testsuite.at |   1 +
 9 files changed, 1033 insertions(+), 26 deletions(-)
 create mode 100644 ovn/lex.c
 create mode 100644 ovn/lex.h
 create mode 100644 tests/ovn.at
 create mode 100644 tests/test-ovn.c

diff --git a/ovn/TODO b/ovn/TODO
index e405c7c..075cb26 100644
--- a/ovn/TODO
+++ b/ovn/TODO
@@ -19,11 +19,6 @@
    Probably should be defined so that the data structure is also
    useful for references to fields in action parsing.
 
-** Lexical analysis.
-
-   Probably should be defined so that the lexer can be reused for
-   parsing actions.
-
 ** Parsing into syntax tree.
 
 ** Semantic checking against variable definitions.
diff --git a/ovn/automake.mk b/ovn/automake.mk
index a4951dc..88847ac 100644
--- a/ovn/automake.mk
+++ b/ovn/automake.mk
@@ -74,4 +74,7 @@ SUFFIXES += .xml
 	$(AM_V_GEN)$(run_python) $(srcdir)/build-aux/xml2nroff \
 		--version=$(VERSION) $< > $@.tmp && mv $@.tmp $@
 
+lib_LTLIBRARIES += lib/libovn.la
+lib_libovn_la_SOURCES = ovn/lex.c ovn/lex.h
+
 EXTRA_DIST += ovn/TODO
diff --git a/ovn/lex.c b/ovn/lex.c
new file mode 100644
index 0000000..22e942a
--- /dev/null
+++ b/ovn/lex.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "lex.h"
+#include <ctype.h>
+#include <errno.h>
+#include <stdarg.h>
+#include "dynamic-string.h"
+#include "json.h"
+#include "util.h"
+
+/* Initializes 'token'. */
+void
+lex_token_init(struct lex_token *token)
+{
+    token->type = LEX_T_END;
+    token->s = NULL;
+}
+
+/* Frees memory owned by 'token'. */
+void
+lex_token_destroy(struct lex_token *token)
+{
+    free(token->s);
+}
+
+/* Exchanges 'a' and 'b'. */
+void
+lex_token_swap(struct lex_token *a, struct lex_token *b)
+{
+    struct lex_token tmp = *a;
+    *a = *b;
+    *b = tmp;
+}
+
+/* lex_token_format(). */
+
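+/* Returns the number of leading bytes of a union mf_subvalue that must be
+ * all-zeros for a value to be representable in 'format'.  (Hexadecimal can
+ * represent anything, so it requires none.) */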
+static size_t
+lex_token_n_zeros(enum lex_format format)
+{
+    switch (format) {
+    case LEX_F_DECIMAL:     return offsetof(union mf_subvalue, integer);
+    case LEX_F_HEXADECIMAL: return 0;
+    case LEX_F_IPV4:        return offsetof(union mf_subvalue, ipv4);
+    case LEX_F_IPV6:        return offsetof(union mf_subvalue, ipv6);
+    case LEX_F_ETHERNET:    return offsetof(union mf_subvalue, mac);
+    default: OVS_NOT_REACHED();
+    }
+}
+
+/* Returns the effective format for 'token', that is, the format in which it
+ * should actually be printed.  This is ordinarily the same as 'token->format',
+ * but it's always possible that someone sets up a token with a format that
+ * won't work for a value, e.g. 'token->value' is wider than 32 bits but the
+ * format is LEX_F_IPV4.  (The lexer itself won't do that; this is an attempt
+ * to avoid confusion in the future.) */
+static enum lex_format
+lex_token_get_format(const struct lex_token *token)
+{
+    size_t n_zeros = lex_token_n_zeros(token->format);
+    return (is_all_zeros(&token->value, n_zeros)
+            && (token->type != LEX_T_MASKED_INTEGER
+                || is_all_zeros(&token->mask, n_zeros))
+            ? token->format
+            : LEX_F_HEXADECIMAL);
+}
+
+static void
+lex_token_format_value(const union mf_subvalue *value,
+                       enum lex_format format, struct ds *s)
+{
+    switch (format) {
+    case LEX_F_DECIMAL:
+        ds_put_format(s, "%"PRIu64, ntohll(value->integer));
+        break;
+
+    case LEX_F_HEXADECIMAL:
+        for (const uint8_t *p = value->u8, *end = p + ARRAY_SIZE(value->u8);
+             p < end; p++) {
+            if (*p) {
+                ds_put_format(s, "0x%"PRIx8, *p);
+                for (p++; p < end; p++) {
+                    ds_put_format(s, "%02"PRIx8, *p);
+                }
+                return;
+            }
+        }
+        ds_put_cstr(s, "0");
+        break;
+
+    case LEX_F_IPV4:
+        ds_put_format(s, IP_FMT, IP_ARGS(value->ipv4));
+        break;
+
+    case LEX_F_IPV6:
+        print_ipv6_addr(s, &value->ipv6);
+        break;
+
+    case LEX_F_ETHERNET:
+        ds_put_format(s, ETH_ADDR_FMT, ETH_ADDR_ARGS(value->mac));
+        break;
+
+    default:
+        OVS_NOT_REACHED();
+    }
+
+}
+
+static void
+lex_token_format_masked_integer(const struct lex_token *token, struct ds *s)
+{
+    enum lex_format format = lex_token_get_format(token);
+
+    lex_token_format_value(&token->value, format, s);
+    ds_put_char(s, '/');
+
+    const union mf_subvalue *mask = &token->mask;
+    if (format == LEX_F_IPV4 && ip_is_cidr(mask->ipv4)) {
+        ds_put_format(s, "%d", ip_count_cidr_bits(mask->ipv4));
+    } else if (format == LEX_F_IPV6 && ipv6_is_cidr(&mask->ipv6)) {
+        ds_put_format(s, "%d", ipv6_count_cidr_bits(&mask->ipv6));
+    } else {
+        lex_token_format_value(&token->mask, format, s);
+    }
+}
+
+
+static void
+lex_token_format_string(const char *s, struct ds *ds)
+{
+    struct json json;
+    json.type = JSON_STRING;
+    json.u.string = CONST_CAST(char *, s);
+    json_to_ds(&json, 0, ds);
+}
+
+/* Appends a string representation of 'token' to 's', in a format that can be
+ * losslessly parsed back by the lexer.  (LEX_T_END and LEX_T_ERROR can't be
+ * parsed back.) */
+void
+lex_token_format(struct lex_token *token, struct ds *s)
+{
+    switch (token->type) {
+    case LEX_T_END:
+        ds_put_cstr(s, "$");
+        break;
+
+    case LEX_T_ID:
+        ds_put_cstr(s, token->s);
+        break;
+
+    case LEX_T_ERROR:
+        ds_put_cstr(s, "error(");
+        lex_token_format_string(token->s, s);
+        ds_put_char(s, ')');
+        break;
+
+    case LEX_T_STRING:
+        lex_token_format_string(token->s, s);
+        break;
+
+    case LEX_T_INTEGER:
+        lex_token_format_value(&token->value, lex_token_get_format(token), s);
+        break;
+
+    case LEX_T_MASKED_INTEGER:
+        lex_token_format_masked_integer(token, s);
+        break;
+
+    case LEX_T_LPAREN:
+        ds_put_cstr(s, "(");
+        break;
+    case LEX_T_RPAREN:
+        ds_put_cstr(s, ")");
+        break;
+    case LEX_T_LCURLY:
+        ds_put_cstr(s, "{");
+        break;
+    case LEX_T_RCURLY:
+        ds_put_cstr(s, "}");
+        break;
+    case LEX_T_LSQUARE:
+        ds_put_cstr(s, "[");
+        break;
+    case LEX_T_RSQUARE:
+        ds_put_cstr(s, "]");
+        break;
+    case LEX_T_EQ:
+        ds_put_cstr(s, "==");
+        break;
+    case LEX_T_NE:
+        ds_put_cstr(s, "!=");
+        break;
+    case LEX_T_LT:
+        ds_put_cstr(s, "<");
+        break;
+    case LEX_T_LE:
+        ds_put_cstr(s, "<=");
+        break;
+    case LEX_T_GT:
+        ds_put_cstr(s, ">");
+        break;
+    case LEX_T_GE:
+        ds_put_cstr(s, ">=");
+        break;
+    case LEX_T_LOG_NOT:
+        ds_put_cstr(s, "!");
+        break;
+    case LEX_T_LOG_AND:
+        ds_put_cstr(s, "&&");
+        break;
+    case LEX_T_LOG_OR:
+        ds_put_cstr(s, "||");
+        break;
+    case LEX_T_ELLIPSIS:
+        ds_put_cstr(s, "..");
+        break;
+    case LEX_T_COMMA:
+        ds_put_cstr(s, ",");
+        break;
+    case LEX_T_SEMICOLON:
+        ds_put_cstr(s, ";");
+        break;
+    case LEX_T_EQUALS:
+        ds_put_cstr(s, "=");
+        break;
+    default:
+        OVS_NOT_REACHED();
+    }
+
+}
+
+/* lex_token_parse(). */
+
+static void OVS_PRINTF_FORMAT(2, 3)
+lex_error(struct lex_token *token, const char *message, ...)
+{
+    token->type = LEX_T_ERROR;
+
+    va_list args;
+    va_start(args, message);
+    token->s = xvasprintf(message, args);
+    va_end(args);
+}
+
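+/* Parses the 'len' hex digits starting at 'start' into 'token', filling in
+ * token->value from its least-significant byte upward.  Reports an error if
+ * a digit is invalid or the constant needs more than 128 bits. */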
+static void
+lex_parse_hex_integer(const char *start, size_t len, struct lex_token *token)
+{
+    const char *in = start + (len - 1);
+    uint8_t *out = token->value.u8 + (sizeof token->value.u8 - 1);
+
+    for (int i = 0; i < len; i++) {
+        int hexit = hexit_value(in[-i]);
+        if (hexit < 0) {
+            lex_error(token, "Invalid syntax in hexadecimal constant.");
+            return;
+        }
+        if (hexit && i / 2 >= sizeof token->value.u8) {
+            lex_error(token, "Hexadecimal constant requires more than "
+                      "%"PRIuSIZE" bits.", 8 * sizeof token->value.u8);
+            return;
+        }
+        out[-(i / 2)] |= i % 2 ? hexit << 4 : hexit;
+    }
+    token->format = LEX_F_HEXADECIMAL;
+}
+
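+/* Parses a bare numeric constant (decimal, hexadecimal, Ethernet address,
+ * IPv4 address, or IPv6 address) beginning at 'p' into 'token'.  Returns the
+ * position just past the constant. */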
+static const char *
+lex_parse_integer__(const char *p, struct lex_token *token)
+{
+    const char *start = p;
+    const char *end = p + strspn(p, "0123456789abcdefABCDEFxX.:");
+    size_t len = end - start;
+
+    int n;
+    uint8_t mac[ETH_ADDR_LEN];
+
+    token->type = LEX_T_INTEGER;
+    if (!len) {
+        lex_error(token, "Integer constant expected.");
+    } else if (len == 17
+               && ovs_scan(start, ETH_ADDR_SCAN_FMT"%n",
+                           ETH_ADDR_SCAN_ARGS(mac), &n)
+               && n == len) {
+        memcpy(token->value.mac, mac, sizeof token->value.mac);
+        token->format = LEX_F_ETHERNET;
+    } else if (start + strspn(start, "0123456789") == end) {
+        if (p[0] == '0' && len > 1) {
+            lex_error(token, "Decimal constants must not have leading zeros.");
+        } else {
+            unsigned long long int integer;
+            char *tail;
+
+            errno = 0;
+            integer = strtoull(p, &tail, 10);
+            if (tail != end || errno == ERANGE) {
+                lex_error(token, "Decimal constants must be less than 2**64.");
+            } else {
+                token->value.integer = htonll(integer);
+                token->format = LEX_F_DECIMAL;
+            }
+        }
+    } else if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
+        if (len > 2) {
+            lex_parse_hex_integer(start + 2, len - 2, token);
+        } else {
+            lex_error(token, "Hex digits expected following 0%c.", p[1]);
+        }
+    } else if (len < INET6_ADDRSTRLEN) {
+        char copy[INET6_ADDRSTRLEN];
+        memcpy(copy, p, len);
+        copy[len] = '\0';
+
+        struct in_addr ipv4;
+        struct in6_addr ipv6;
+        if (inet_pton(AF_INET, copy, &ipv4) == 1) {
+            token->value.ipv4 = ipv4.s_addr;
+            token->format = LEX_F_IPV4;
+        } else if (inet_pton(AF_INET6, copy, &ipv6) == 1) {
+            token->value.ipv6 = ipv6;
+            token->format = LEX_F_IPV6;
+        } else {
+            lex_error(token, "Invalid numeric constant.");
+        }
+    } else {
+        lex_error(token, "Invalid numeric constant.");
+    }
+
+    ovs_assert(token->type == LEX_T_INTEGER || token->type == LEX_T_ERROR);
+    return end;
+}
+
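+/* Like lex_parse_integer__(), but also handles an optional "/<mask>" suffix,
+ * turning the token into a LEX_T_MASKED_INTEGER when a valid mask is
+ * present. */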
+static const char *
+lex_parse_integer(const char *p, struct lex_token *token)
+{
+    memset(&token->value, 0, sizeof token->value);
+    p = lex_parse_integer__(p, token);
+    if (token->type == LEX_T_INTEGER && *p == '/') {
+        struct lex_token mask;
+
+        lex_token_init(&mask);
+        memset(&mask.value, 0, sizeof mask.value);
+        p = lex_parse_integer__(p + 1, &mask);
+        if (mask.type == LEX_T_INTEGER) {
+            token->type = LEX_T_MASKED_INTEGER;
+
+            uint32_t prefix_bits = ntohll(mask.value.integer);
+            if (token->format == mask.format) {
+                /* Same format value and mask is always OK. */
+                token->mask = mask.value;
+            } else if (token->format == LEX_F_IPV4
+                       && mask.format == LEX_F_DECIMAL
+                       && prefix_bits <= 32) {
+                /* IPv4 address with decimal mask is a CIDR prefix. */
+                token->mask.integer = htonll(ntohl(be32_prefix_mask(
+                                                       prefix_bits)));
+            } else if (token->format == LEX_F_IPV6
+                       && mask.format == LEX_F_DECIMAL
+                       && prefix_bits <= 128) {
+                /* IPv6 address with decimal mask is a CIDR prefix. */
+                token->mask.ipv6 = ipv6_create_mask(prefix_bits);
+            } else if (token->format == LEX_F_DECIMAL
+                       && mask.format == LEX_F_HEXADECIMAL
+                       && token->value.integer == 0) {
+                /* Special case for e.g. 0/0x1234. */
+                token->format = LEX_F_HEXADECIMAL;
+                token->mask = mask.value;
+            } else {
+                lex_error(token, "Value and mask have incompatible formats.");
+                return p;
+            }
+
+            for (int i = 0; i < ARRAY_SIZE(token->mask.be32); i++) {
+                ovs_be32 v = token->value.be32[i];
+                ovs_be32 m = token->mask.be32[i];
+
+                if (v & ~m) {
+                    lex_error(token, "Value contains unmasked 1-bits.");
+                    break;
+                }
+            }
+
+            return p;
+        } else {
+            lex_token_swap(&mask, token);
+        }
+        lex_token_destroy(&mask);
+    }
+    return p;
+}
+
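+/* Parses a quoted string, using JSON string syntax, starting at the '"' that
+ * 'p' points to, into 'token'.  Returns the position just past the string. */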
+static const char *
+lex_parse_string(const char *p, struct lex_token *token)
+{
+    const char *start = ++p;
+    for (;;) {
+        switch (*p) {
+        case '\0':
+            lex_error(token, "Input ends inside quoted string.");
+            return p;
+
+        case '"':
+            token->type = (json_string_unescape(start, p - start, &token->s)
+                           ? LEX_T_STRING : LEX_T_ERROR);
+            return p + 1;
+
+        case '\\':
+            p++;
+            if (*p) {
+                p++;
+            }
+            break;
+
+        default:
+            p++;
+            break;
+        }
+    }
+
+}
+
+static bool
+lex_is_id1(unsigned char c)
+{
+    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+            || c == '_' || c == '.');
+}
+
+static bool
+lex_is_idn(unsigned char c)
+{
+    return lex_is_id1(c) || (c >= '0' && c <= '9');
+}
+
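+/* Parses an identifier beginning at 'p' into 'token'.  Returns the position
+ * just past the identifier. */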
+static const char *
+lex_parse_id(const char *p, struct lex_token *token)
+{
+    const char *start = p;
+
+    do {
+        p++;
+    } while (lex_is_idn(*p));
+
+    token->type = LEX_T_ID;
+    token->s = xmemdup0(start, p - start);
+    return p;
+}
+
+/* Initializes 'token' and parses the first token from the beginning of
+ * null-terminated string 'p' into 'token'.  Returns the character position
+ * at which to begin parsing the next token. */
+const char *
+lex_token_parse(struct lex_token *token, const char *p)
+{
+    lex_token_init(token);
+
+next:
+    switch (*p) {
+    case '\0':
+        token->type = LEX_T_END;
+        return p;
+
+    case ' ': case '\t': case '\n': case '\r':
+        p++;
+        goto next;
+
+    case '/':
+        p++;
+        if (*p == '/') {
+            do {
+                p++;
+            } while (*p != '\0' && *p != '\n');
+            goto next;
+        } else if (*p == '*') {
+            p++;
+            for (;;) {
+                if (*p == '*' && p[1] == '/') {
+                    p += 2;
+                    goto next;
+                } else if (*p == '\0' || *p == '\n') {
+                    lex_error(token, "`/*' without matching `*/'.");
+                    return p;
+                } else {
+                    p++;
+                }
+            }
+            goto next;
+        } else {
+            lex_error(token,
+                      "`/' is only valid as part of `//' or `/*'.");
+        }
+        break;
+
+    case '(':
+        token->type = LEX_T_LPAREN;
+        p++;
+        break;
+
+    case ')':
+        token->type = LEX_T_RPAREN;
+        p++;
+        break;
+
+    case '{':
+        token->type = LEX_T_LCURLY;
+        p++;
+        break;
+
+    case '}':
+        token->type = LEX_T_RCURLY;
+        p++;
+        break;
+
+    case '[':
+        token->type = LEX_T_LSQUARE;
+        p++;
+        break;
+
+    case ']':
+        token->type = LEX_T_RSQUARE;
+        p++;
+        break;
+
+    case '=':
+        p++;
+        if (*p == '=') {
+            token->type = LEX_T_EQ;
+            p++;
+        } else {
+            token->type = LEX_T_EQUALS;
+        }
+        break;
+
+    case '!':
+        p++;
+        if (*p == '=') {
+            token->type = LEX_T_NE;
+            p++;
+        } else {
+            token->type = LEX_T_LOG_NOT;
+        }
+        break;
+
+    case '&':
+        p++;
+        if (*p == '&') {
+            token->type = LEX_T_LOG_AND;
+            p++;
+        } else {
+            lex_error(token, "`&' is only valid as part of `&&'.");
+        }
+        break;
+
+    case '|':
+        p++;
+        if (*p == '|') {
+            token->type = LEX_T_LOG_OR;
+            p++;
+        } else {
+            lex_error(token, "`|' is only valid as part of `||'.");
+        }
+        break;
+
+    case '<':
+        p++;
+        if (*p == '=') {
+            token->type = LEX_T_LE;
+            p++;
+        } else {
+            token->type = LEX_T_LT;
+        }
+        break;
+
+    case '>':
+        p++;
+        if (*p == '=') {
+            token->type = LEX_T_GE;
+            p++;
+        } else {
+            token->type = LEX_T_GT;
+        }
+        break;
+
+    case '.':
+        p++;
+        if (*p == '.') {
+            token->type = LEX_T_ELLIPSIS;
+            p++;
+        } else {
+            lex_error(token, "`.' is only valid as part of `..' "
+                      "or a numeric constant.");
+        }
+        break;
+
+    case ',':
+        p++;
+        token->type = LEX_T_COMMA;
+        break;
+
+    case ';':
+        p++;
+        token->type = LEX_T_SEMICOLON;
+        break;
+
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9':
+    case ':':
+        p = lex_parse_integer(p, token);
+        break;
+
+    case '"':
+        p = lex_parse_string(p, token);
+        break;
+
+    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+        /* We need to distinguish an Ethernet address or IPv6 address from an
+         * identifier.  Fortunately, Ethernet addresses, and IPv6 addresses
+         * that are ambiguous based on the first character, always start with
+         * hex digits followed by a colon, but identifiers never do. */
+        p = (p[strspn(p, "0123456789abcdefABCDEF")] == ':'
+             ? lex_parse_integer(p, token)
+             : lex_parse_id(p, token));
+        break;
+
+    default:
+        if (lex_is_id1(*p)) {
+            p = lex_parse_id(p, token);
+        } else {
+            if (isprint((unsigned char) *p)) {
+                lex_error(token, "Invalid character `%c' in input.", *p);
+            } else {
+                lex_error(token, "Invalid byte 0x%d in input.", *p);
+            }
+            p++;
+        }
+        break;
+    }
+
+    return p;
+}
+
+/* Initializes 'lexer' for parsing 'input'.
+ *
+ * While the lexer is in use, 'input' must remain available, but the caller
+ * otherwise retains ownership of 'input'.
+ *
+ * The caller must call lexer_get() to obtain the first token. */
+void
+lexer_init(struct lexer *lexer, const char *input)
+{
+    lexer->input = input;
+    memset(&lexer->token, 0, sizeof lexer->token);
+}
+
+/* Frees storage associated with 'lexer'. */
+void
+lexer_destroy(struct lexer *lexer)
+{
+    lex_token_destroy(&lexer->token);
+}
+
+/* Obtains the next token from 'lexer' into 'lexer->token', and returns the
+ * token's type.  The caller may examine 'lexer->token' directly to obtain full
+ * information about the token. */
+enum lex_type
+lexer_get(struct lexer *lexer)
+{
+    lex_token_destroy(&lexer->token);
+    lexer->input = lex_token_parse(&lexer->token, lexer->input);
+    return lexer->token.type;
+}
diff --git a/ovn/lex.h b/ovn/lex.h
new file mode 100644
index 0000000..b035c65
--- /dev/null
+++ b/ovn/lex.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef OVN_LEX_H
+#define OVN_LEX_H 1
+
+/* OVN lexical analyzer
+ * ====================
+ *
+ * This is a simple lexical analyzer (or tokenizer) for OVN match expressions
+ * and ACLs. */
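+
+/* A caller usually works through the "struct lexer" wrapper declared at the
+ * bottom of this header.  A minimal sketch (every name used here is declared
+ * below):
+ *
+ *     struct lexer lexer;
+ *
+ *     lexer_init(&lexer, "tcp.dst == {80, 443}");
+ *     while (lexer_get(&lexer) != LEX_T_END) {
+ *         ...examine lexer.token...
+ *     }
+ *     lexer_destroy(&lexer);
+ */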
+
+#include "meta-flow.h"
+
+struct ds;
+
+/* Token type. */
+enum lex_type {
+    LEX_T_END,                  /* end of input */
+
+    /* Tokens with auxiliary data. */
+    LEX_T_ID,                   /* foo */
+    LEX_T_STRING,               /* "foo" */
+    LEX_T_INTEGER,              /* 12345, 1.2.3.4, ::1, or 01:02:03:04:05:06 */
+    LEX_T_MASKED_INTEGER,       /* 12345/10 or 1.2.0.0/16 or ::2/127 or... */
+    LEX_T_ERROR,                /* invalid input */
+
+    /* Bare tokens. */
+    LEX_T_LPAREN,               /* ( */
+    LEX_T_RPAREN,               /* ) */
+    LEX_T_LCURLY,               /* { */
+    LEX_T_RCURLY,               /* } */
+    LEX_T_LSQUARE,              /* [ */
+    LEX_T_RSQUARE,              /* ] */
+    LEX_T_EQ,                   /* == */
+    LEX_T_NE,                   /* != */
+    LEX_T_LT,                   /* < */
+    LEX_T_LE,                   /* <= */
+    LEX_T_GT,                   /* > */
+    LEX_T_GE,                   /* >= */
+    LEX_T_LOG_NOT,              /* ! */
+    LEX_T_LOG_AND,              /* && */
+    LEX_T_LOG_OR,               /* || */
+    LEX_T_ELLIPSIS,             /* .. */
+    LEX_T_COMMA,                /* , */
+    LEX_T_SEMICOLON,            /* ; */
+    LEX_T_EQUALS,               /* = */
+};
+
+/* Subtype for LEX_T_INTEGER and LEX_T_MASKED_INTEGER tokens.
+ *
+ * These do not change the semantics of a token; instead, they determine the
+ * format used when a token is serialized back to a text form.  That's
+ * important because 3232268289 is meaningless to a human whereas 192.168.128.1
+ * has some actual significance. */
+enum lex_format {
+    LEX_F_DECIMAL,
+    LEX_F_HEXADECIMAL,
+    LEX_F_IPV4,
+    LEX_F_IPV6,
+    LEX_F_ETHERNET,
+};
+
+/* A token.
+ *
+ * 's' is owned by the token. */
+struct lex_token {
+    enum lex_type type;         /* One of LEX_*. */
+    char *s;                    /* LEX_T_ID, LEX_T_STRING, LEX_T_ERROR only. */
+    enum lex_format format;     /* LEX_T_INTEGER, LEX_T_MASKED_INTEGER only. */
+    union mf_subvalue value;    /* LEX_T_INTEGER, LEX_T_MASKED_INTEGER only. */
+    union mf_subvalue mask;     /* LEX_T_MASKED_INTEGER only. */
+};
+
+void lex_token_init(struct lex_token *);
+void lex_token_destroy(struct lex_token *);
+void lex_token_swap(struct lex_token *, struct lex_token *);
+
+void lex_token_format(struct lex_token *, struct ds *);
+const char *lex_token_parse(struct lex_token *, const char *input);
+
+/* A lexical analyzer. */
+struct lexer {
+    const char *input;          /* Remaining input (not owned by lexer). */
+    struct lex_token token;     /* Current token (owned by lexer). */
+};
+
+void lexer_init(struct lexer *, const char *input);
+void lexer_destroy(struct lexer *);
+
+enum lex_type lexer_get(struct lexer *);
+
+#endif /* ovn/lex.h */
diff --git a/ovn/ovn.xml b/ovn/ovn.xml
index a233112..e7fbb5f 100644
--- a/ovn/ovn.xml
+++ b/ovn/ovn.xml
@@ -278,9 +278,11 @@
       </p>
 
       <p>
-        The <code>inport</code> and <code>outport</code> fields have string
-        values.  The useful values are <ref column="logical_port"/> names from
-        the <ref column="Bindings"/> and <ref column="Gateway"/> table.
+        The <code>inport</code> and <code>outport</code> fields have quoted
+        string values.  Quoted strings have the same syntax as quoted strings
+        in JSON (thus, they are Unicode strings).  The useful values are <ref
+        column="logical_port"/> names from the <ref column="Bindings"/> and
+        <ref column="Gateway"/> table.
       </p>
 
       <p>
@@ -289,7 +291,7 @@
 
       <ul>
         <li><code>()</code></li>
-        <li><code>==   !=   &lt;   &lt;=   &gt;   &gt;=   in   not in</code></li>
+        <li><code>==   !=   &lt;   &lt;=   &gt;   &gt;=</code></li>
         <li><code>!</code></li>
         <li><code>&amp;&amp;</code></li>
         <li><code>||</code></li>
@@ -315,27 +317,28 @@
       </p>
 
       <p>
-        The relational operators are &lt;, &lt;=, &gt;, and &gt;=.  Their
-        operands must be a field and a constant, in either order; the constant
-        must not be masked.  These operators are most commonly useful for L4
-        ports, e.g. <code>tcp.src &lt; 1024</code>.  Implementation of the
-        relational operators is expensive.
-      </p>
-
-      <p>
-        The set membership operator <code>in</code>, with syntax
-        ``<code><var>field</var> in { <var>constant1</var>,
-        <var>constant2</var>,</code> ... <code>}</code>'', is syntactic sugar
-        for ``<code>(<var>field</var> == <var>constant1</var> ||
+        The <code>==</code> operator also serves as a set membership operator,
+        with syntax ``<code><var>field</var> == { <var>constant1</var>,
+        <var>constant2</var>,</code> ... <code>}</code>'' acting as syntactic
+        sugar for ``<code>(<var>field</var> == <var>constant1</var> ||
         <var>field</var> == <var>constant2</var> || </code>...<code>)</code>.
-        Conversely, ``<code><var>field</var> not in { <var>constant1</var>,
-        <var>constant2</var>, </code>...<code> }</code>'' is syntactic sugar
-        for ``<code>(<var>field</var> != <var>constant1</var> &amp;&amp;
+        Similarly, ``<code><var>field</var> != { <var>constant1</var>,
+        <var>constant2</var>, </code>...<code> }</code>'' is equivalent to
+        ``<code>(<var>field</var> != <var>constant1</var> &amp;&amp;
         <var>field</var> != <var>constant2</var> &amp;&amp;
         </code>...<code>)</code>''.
       </p>
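+
+      <p>
+        For example, <code>tcp.src == {80, 443}</code> is shorthand for
+        <code>(tcp.src == 80 || tcp.src == 443)</code>, and <code>tcp.src !=
+        {80, 443}</code> is shorthand for <code>(tcp.src != 80 &amp;&amp;
+        tcp.src != 443)</code>.
+      </p>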
 
       <p>
+        The relational operators are &lt;, &lt;=, &gt;, and &gt;=.  Their
+        operands must be a field and a constant, in either order.  These
+        operators are most commonly useful for L4 ports, e.g. <code>tcp.src
+        &lt; 1024</code>.  The constant operand must not be masked, but the
+        field reference may use subfield syntax, e.g. <code>vlan.tci[12..15]
+        &gt; 1</code>.  Implementation of the relational operators is
+        expensive.
+      </p>
+
+      <p>
         The unary prefix operator <code>!</code> yields its operand's inverse.
       </p>
 
diff --git a/tests/automake.mk b/tests/automake.mk
index 50d8ad2..949454c 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -80,7 +80,8 @@ TESTSUITE_AT = \
 	tests/rstp.at \
 	tests/interface-reconfigure.at \
 	tests/vlog.at \
-	tests/vtep-ctl.at
+	tests/vtep-ctl.at \
+	tests/ovn.at
 
 KMOD_TESTSUITE_AT = \
 	tests/kmod-testsuite.at \
@@ -271,6 +272,7 @@ tests_ovstest_SOURCES = \
 	tests/test-multipath.c \
 	tests/test-netflow.c \
 	tests/test-odp.c \
+	tests/test-ovn.c \
 	tests/test-packets.c \
 	tests/test-random.c \
 	tests/test-reconnect.c \
@@ -288,7 +290,7 @@ tests_ovstest_SOURCES += \
 	tests/test-unix-socket.c
 endif
 
-tests_ovstest_LDADD = lib/libopenvswitch.la
+tests_ovstest_LDADD = lib/libopenvswitch.la lib/libovn.la
 dist_check_SCRIPTS = tests/flowgen.pl
 
 noinst_PROGRAMS += tests/test-strtok_r
diff --git a/tests/ovn.at b/tests/ovn.at
new file mode 100644
index 0000000..69a5d96
--- /dev/null
+++ b/tests/ovn.at
@@ -0,0 +1,97 @@
+AT_BANNER([OVN])
+
+AT_SETUP([ovn -- lexer])
+dnl OVN lexer test cases.
+dnl For lines without =>, input and expected output are identical.
dnl For lines with =>, input precedes => and expected output follows =>.
+AT_DATA([test-cases.txt], [dnl
+foo bar baz quuxquuxquux _abcd_ a.b.c.d a123_.456
+"abc\u0020def" => "abc def"
+" => error("Input ends inside quoted string.")dnl "
+
+a/*b*/c => a c
+a//b c => a
+a/**/b => a b
+a/*/b => a error("`/*' without matching `*/'.")
+a/*/**/b => a b
+a/b => a error("`/' is only valid as part of `//' or `/*'.") b
+
+0 1 12345 18446744073709551615
+18446744073709551616 => error("Decimal constants must be less than 2**64.")
+9999999999999999999999 => error("Decimal constants must be less than 2**64.")
+01 => error("Decimal constants must not have leading zeros.")
+
+0/0
+0/1
+1/0 => error("Value contains unmasked 1-bits.")
+1/1
+128/384
+1/3
+1/ => error("Integer constant expected.")
+
+1/0x123 => error("Value and mask have incompatible formats.")
+
+0x1234
+0x01234 => 0x1234
+0x0 => 0
+0x000 => 0
+0xfedcba9876543210
+0XFEDCBA9876543210 => 0xfedcba9876543210
+0xfedcba9876543210fedcba9876543210
+0xfedcba9876543210fedcba98765432100 => error("Hexadecimal constant requires more than 128 bits.")
+0x0000fedcba9876543210fedcba9876543210 => 0xfedcba9876543210fedcba9876543210
+0x => error("Hex digits expected following 0x.")
+0X => error("Hex digits expected following 0X.")
+0x0/0x0 => 0/0
+0x0/0x1 => 0/0x1
+0x1/0x0 => error("Value contains unmasked 1-bits.")
+0xffff/0x1ffff
+0x. => error("Invalid syntax in hexadecimal constant.")
+
+192.168.128.1 1.2.3.4 255.255.255.255 0.0.0.0
+256.1.2.3 => error("Invalid numeric constant.")
+192.168.0.0/16
+192.168.0.0/255.255.0.0 => 192.168.0.0/16
+192.168.0.0/255.255.255.0 => 192.168.0.0/24
+192.168.0.0/255.255.0.255
+192.168.0.0/255.0.0.0 => error("Value contains unmasked 1-bits.")
+192.168.0.0/32
+192.168.0.0/255.255.255.255 => 192.168.0.0/32
+
+::
+::1
+ff00::1234 => ff00::1234
+2001:db8:85a3::8a2e:370:7334
+2001:db8:85a3:0:0:8a2e:370:7334 => 2001:db8:85a3::8a2e:370:7334
+2001:0db8:85a3:0000:0000:8a2e:0370:7334 => 2001:db8:85a3::8a2e:370:7334
+::ffff:192.0.2.128
+::ffff:c000:0280 => ::ffff:192.0.2.128
+::1/::1
+::1/ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff => ::1/128
+::1/128
+ff00::/8
+ff00::/ff00:: => ff00::/8
+
+01:23:45:67:ab:cd
+01:23:45:67:AB:CD => 01:23:45:67:ab:cd
+fe:dc:ba:98:76:54
+FE:DC:ba:98:76:54 => fe:dc:ba:98:76:54
+01:00:00:00:00:00/01:00:00:00:00:00
+ff:ff:ff:ff:ff:ff/ff:ff:ff:ff:ff:ff
+fe:ff:ff:ff:ff:ff/ff:ff:ff:ff:ff:ff
+ff:ff:ff:ff:ff:ff/fe:ff:ff:ff:ff:ff => error("Value contains unmasked 1-bits.")
+fe:x => error("Invalid numeric constant.")
+00:01:02:03:04:x => error("Invalid numeric constant.")
+
+(){}[[]]==!=<<=>>=!&&||..,;= => ( ) { } [[ ]] == != < <= > >= ! && || .. , ; =
+& => error("`&' is only valid as part of `&&'.")
+| => error("`|' is only valid as part of `||'.")
+. => error("`.' is only valid as part of `..' or a numeric constant.")
+
+^ => error("Invalid character `^' in input.")
+])
+AT_CAPTURE_FILE([input.txt])
+sed 's/ =>.*//' test-cases.txt > input.txt
+sed 's/.* => //' test-cases.txt > expout
+AT_CHECK([ovstest test-ovn lex < input.txt], [0], [expout])
+AT_CLEANUP
diff --git a/tests/test-ovn.c b/tests/test-ovn.c
new file mode 100644
index 0000000..a4fd23f
--- /dev/null
+++ b/tests/test-ovn.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "command-line.h"
+#include "dynamic-string.h"
+#include "ovn/lex.h"
+#include "ovstest.h"
+#include "util.h"
+
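+/* Reports on stderr any significant way in which the re-parsed token 'b'
+ * differs from the original token 'a'. */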
+static void
+compare_token(const struct lex_token *a, const struct lex_token *b)
+{
+    if (a->type != b->type) {
+        fprintf(stderr, "type differs: %d -> %d\n", a->type, b->type);
+        return;
+    }
+
+    if (!((a->s && b->s && !strcmp(a->s, b->s))
+          || (!a->s && !b->s))) {
+        fprintf(stderr, "string differs: %s -> %s\n",
+                a->s ? a->s : "(null)",
+                b->s ? b->s : "(null)");
+        return;
+    }
+
+    if (a->type == LEX_T_INTEGER || a->type == LEX_T_MASKED_INTEGER) {
+        if (memcmp(&a->value, &b->value, sizeof a->value)) {
+            fprintf(stderr, "value differs\n");
+            return;
+        }
+
+        if (a->type == LEX_T_MASKED_INTEGER
+            && memcmp(&a->mask, &b->mask, sizeof a->mask)) {
+            fprintf(stderr, "mask differs\n");
+            return;
+        }
+    }
+
+    if (a->format != b->format
+        && !(a->format == LEX_F_HEXADECIMAL
+             && b->format == LEX_F_DECIMAL
+             && a->value.integer == 0)) {
+        fprintf(stderr, "format differs: %d -> %d\n", a->format, b->format);
+    }
+}
+
+static void
+test_lex(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
+{
+    struct ds input;
+    struct ds output;
+
+    ds_init(&input);
+    ds_init(&output);
+    while (!ds_get_line(&input, stdin)) {
+        struct lexer lexer;
+
+        lexer_init(&lexer, ds_cstr(&input));
+        ds_clear(&output);
+        while (lexer_get(&lexer) != LEX_T_END) {
+            size_t len = output.length;
+            lex_token_format(&lexer.token, &output);
+
+            /* Check that the formatted version can really be parsed back
+             * losslessly. */
+            if (lexer.token.type != LEX_T_ERROR) {
+                const char *s = ds_cstr(&output) + len;
+                struct lexer l2;
+
+                lexer_init(&l2, s);
+                lexer_get(&l2);
+                compare_token(&lexer.token, &l2.token);
+                lexer_destroy(&l2);
+            }
+            ds_put_char(&output, ' ');
+        }
+        lexer_destroy(&lexer);
+
+        ds_chomp(&output, ' ');
+        puts(ds_cstr(&output));
+    }
+    ds_destroy(&input);
+    ds_destroy(&output);
+}
+
+static const struct command commands[] = {
+    {"lex", NULL, 0, 0, test_lex},
+    {NULL, NULL, 0, 0, NULL},
+};
+
+static void
+test_ovn_main(int argc, char *argv[])
+{
+    set_program_name(argv[0]);
+    run_command(argc - 1, argv + 1, commands);
+}
+
+OVSTEST_REGISTER("test-ovn", test_ovn_main);
diff --git a/tests/testsuite.at b/tests/testsuite.at
index cd7f455..ba1f5bb 100644
--- a/tests/testsuite.at
+++ b/tests/testsuite.at
@@ -65,3 +65,4 @@ m4_include([tests/stp.at])
 m4_include([tests/rstp.at])
 m4_include([tests/vlog.at])
 m4_include([tests/vtep-ctl.at])
+m4_include([tests/ovn.at])
-- 
2.1.3