Grammar

Lexing

The following table maps lexemes to their corresponding tokens. Keywords currently serve a double purpose: in addition to their normal meaning, they may also be used as record field names.

Lexeme/Token Table

(*** Scalars ***)

(** fixed-size **)
"void" -> VOID
"bool" -> BOOL

"int8" -> INT8
"int16" -> INT16
"int32" -> INT32
"int64" -> INT64
"int128" -> INT128

"uint8" -> UINT8
"uint16" -> UINT16
"uint32" -> UINT32
"uint64" -> UINT64
"uint128" -> UINT128

(* binary floating point (IEEE 754-2008) *)
"float16" -> FLOAT16
"float32" -> FLOAT32
"float64" -> FLOAT64
"float128" -> FLOAT128

(* complex numbers (IEEE 754-2008) *)
"complex64" ->  COMPLEX64    (* complex<float32> *)
"complex128" -> COMPLEX128   (* complex<float64> *)

(** aliases **)
(* machine dependent *)
"size" -> SIZE         (* size_t *)
"intptr" -> INTPTR     (* intptr_t *)
"uintptr" -> UINTPTR   (* uintptr_t *)

(* machine independent *)
"int" -> INT           (* int32_t *)
"real" -> REAL         (* float64 *)
"complex" -> COMPLEX   (* complex<float64> *)


(*** Chars, strings, bytes ***)
"char" -> CHAR                 (* utf32 character *)
"string" -> STRING             (* utf8 string *)
"fixed_string" -> FIXED_STRING (* fixed_string[size] or fixed_string[size, encoding] *)
"bytes" -> BYTES               (* bytes[align=2] *)
"fixed_bytes" -> FIXED_BYTES   (* fixed_bytes[size] or fixed_bytes[size, align=2] *)
"align" -> ALIGN               (* keyword argument of the "bytes" and "fixed_bytes" constructors *)


(*** Pointers, option type ***)
"pointer" -> POINTER  (* pointer[datashape] *)
"option" -> OPTION    (* option[datashape] *)


(*** Dimension kinds ***)
"fixed" -> FIXED (* fixed[size] *)
"var" -> VAR     (* var *)


(*** Type kinds ***)
"Any" -> ANY_KIND
"Scalar" -> SCALAR_KIND
"Categorical" -> CATEGORICAL_KIND
"FixedBytes" -> FIXED_BYTES_KIND
"FixedString" -> FIXED_STRING_KIND
"Fixed" -> FIXED_DIM_KIND


(*** Punctuation ***)
"," -> COMMA
":" -> COLON
"(" -> LPAREN
")" -> RPAREN
"{" -> LBRACE
"}" -> RBRACE
"[" -> LBRACK
"]" -> RBRACK
"*" -> STAR
"**" -> DOUBLESTAR
"..." -> ELLIPSIS
"->" -> RARROW
"=" -> EQUAL
"?" -> QUESTIONMARK


(*** Value-carrying tokens ***)
INTEGER    (* natural number *)
NAME_LOWER (* ['a'-'z']['a'-'z' 'A'-'Z' '0'-'9' '_']* *)
NAME_UPPER (* ['A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']* *)
NAME_OTHER (* '_' ['A'-'Z']['a'-'z' 'A'-'Z' '0'-'9' '_']* *)
STRINGLIT  (* only used for encoding arguments: 'utf8' etc.  *)

Encodings

fixed_string and char take encoding arguments, which must be given as string literals. Single quotes are the preferred spelling; double quotes are used here only for better syntax highlighting:

"A"    | "ascii" | "us-ascii" -> Ascii
"U8"   | "utf8"  | "utf-8"    -> Utf8
"U16"  | "utf16" | "utf-16"   -> Utf16
"U32"  | "utf32" | "utf-32"   -> Utf32
"ucs2" | "ucs-2" | "ucs_2"    -> Ucs2

Grammar

This is the actual grammar in BNF form:

input:
  datashape EOF

datashape:
  datashape_nooption
| QUESTIONMARK datashape_nooption
| OPTION LBRACK datashape_nooption RBRACK

datashape_nooption:
  (* dimension types *)
  INTEGER STAR datashape
| FIXED LBRACK INTEGER RBRACK STAR datashape
| FIXED_DIM_KIND STAR datashape
| VAR STAR datashape
| NAME_UPPER STAR datashape
| ELLIPSIS STAR datashape
| NAME_UPPER ELLIPSIS STAR datashape

  (* power dimension syntax sugar *)
| INTEGER DOUBLESTAR INTEGER STAR datashape
| VAR DOUBLESTAR INTEGER STAR datashape
| NAME_UPPER DOUBLESTAR INTEGER STAR datashape

| ANY_KIND

| dtype

dtype:
  (***** Scalars *****)

  (*** fixed-size ***)
  VOID
| BOOL

| INT8
| INT16
| INT32
| INT64
| INT128

| UINT8
| UINT16
| UINT32
| UINT64
| UINT128

  (* binary floating point (IEEE 754-2008) *)
| FLOAT16
| FLOAT32
| FLOAT64
| FLOAT128

  (* complex numbers (IEEE 754-2008) *)
| COMPLEX64
| COMPLEX128

  (*** aliases ***)
  (* machine independent *)
| INT
| REAL
| COMPLEX

  (* machine dependent *)
| INTPTR
| UINTPTR
| SIZE

  (*** complex constructor (scalars internally) ***)
  (* complex[float32] *)
| COMPLEX LBRACK FLOAT32 RBRACK
  (* complex[float64] *)
| COMPLEX LBRACK FLOAT64 RBRACK
  (* complex[real] *)
| COMPLEX LBRACK REAL RBRACK

  (***** Chars, strings, bytes *****)
  (* char[encoding] *)
| CHAR LBRACK STRINGLIT RBRACK

  (* alias: unicode character (utf32) *)
| CHAR

  (* unicode string (utf8) *)
| STRING

  (* fixed_string[size] *)
| FIXED_STRING LBRACK INTEGER RBRACK
  (* fixed_string[size, encoding] *)
| FIXED_STRING LBRACK INTEGER COMMA STRINGLIT RBRACK

  (* bytes[align] (target alignment) *)
| BYTES LBRACK ALIGN EQUAL INTEGER RBRACK

  (* fixed_bytes[size, align] (data alignment) *)
| FIXED_BYTES LBRACK INTEGER COMMA ALIGN EQUAL INTEGER RBRACK

  (* pointer[datashape] *)
| POINTER LBRACK datashape RBRACK

  (* dtype variable *)
| NAME_UPPER

  (* dtype kinds *)
| SCALAR_KIND
| CATEGORICAL_KIND
| FIXED_BYTES_KIND
| FIXED_STRING_KIND

| NAME_UPPER LBRACK datashape RBRACK

| tuple_type
| struct_type
| function_type

variadic_flag:
  (* empty *)
| ELLIPSIS

comma_variadic_flag:
  (* empty *)
| COMMA
| COMMA ELLIPSIS

tuple_type:
  LPAREN variadic_flag RPAREN
| LPAREN tuple_item_list comma_variadic_flag RPAREN

tuple_item_list:
  datashape
| tuple_item_list COMMA datashape

struct_type:
  LBRACE variadic_flag RBRACE
| LBRACE struct_field_list comma_variadic_flag RBRACE

struct_field_list:
  struct_field
| struct_field_list COMMA struct_field

struct_field:
  struct_field_name COLON datashape

struct_field_name:
  NAME_LOWER
| NAME_UPPER
| NAME_OTHER
| keyword

function_type:
  tuple_type RARROW datashape
| LPAREN struct_field_list comma_variadic_flag RPAREN RARROW datashape
| LPAREN tuple_item_list COMMA struct_field_list comma_variadic_flag RPAREN
  (* line continued *) RARROW datashape
| LPAREN tuple_item_list COMMA ELLIPSIS COMMA struct_field_list comma_variadic_flag RPAREN
  (* line continued *) RARROW datashape

(* record fields may have keyword names *)
keyword:
  VOID
| BOOL
| INT8
| INT16
| INT32
| INT64
| INT128
| UINT8
| UINT16
| UINT32
| UINT64
| UINT128
| FLOAT16
| FLOAT32
| FLOAT64
| FLOAT128
| COMPLEX64
| COMPLEX128
| INTPTR
| UINTPTR
| SIZE
| REAL
| COMPLEX
| INT
| CHAR
| STRING
| FIXED_STRING
| BYTES
| FIXED_BYTES
| POINTER
| OPTION
| FIXED
| VAR
| ALIGN
| ANY_KIND
| SCALAR_KIND
| CATEGORICAL_KIND
| FIXED_BYTES_KIND
| FIXED_STRING_KIND
| FIXED_DIM_KIND