diff --git a/c/examples/json_struct_metadata.c b/c/examples/json_struct_metadata.c
new file mode 100644
index 0000000000..f917b1aa42
--- /dev/null
+++ b/c/examples/json_struct_metadata.c
@@ -0,0 +1,211 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <tskit.h>
+
+// these are properties of the ``json+struct`` codec, documented in tskit
+#define JSON_STRUCT_HEADER_SIZE 21
+
+const uint8_t json_struct_codec_magic[4] = { 'J', 'B', 'L', 'B' };
+const uint8_t json_struct_codec_version = 1;
+
+// little-endian read of a uint64_t from an address; p must have 8 readable bytes
+static uint64_t
+load_u64_le(const uint8_t *p)
+{
+    uint64_t value = (uint64_t) p[0];
+    value |= (uint64_t) p[1] << 8;
+    value |= (uint64_t) p[2] << 16;
+    value |= (uint64_t) p[3] << 24;
+    value |= (uint64_t) p[4] << 32;
+    value |= (uint64_t) p[5] << 40;
+    value |= (uint64_t) p[6] << 48;
+    value |= (uint64_t) p[7] << 56;
+    return value;
+}
+
+// little-endian write of a uint64_t to an address; dest must have 8 writable bytes
+static void
+set_u64_le(uint8_t *dest, uint64_t value)
+{
+    dest[0] = (uint8_t) (value & 0xFF);
+    dest[1] = (uint8_t) ((value >> 8) & 0xFF);
+    dest[2] = (uint8_t) ((value >> 16) & 0xFF);
+    dest[3] = (uint8_t) ((value >> 24) & 0xFF);
+    dest[4] = (uint8_t) ((value >> 32) & 0xFF);
+    dest[5] = (uint8_t) ((value >> 40) & 0xFF);
+    dest[6] = (uint8_t) ((value >> 48) & 0xFF);
+    dest[7] = (uint8_t) ((value >> 56) & 0xFF);
+}
+
+// Extract the json and binary payloads from the `json+struct` codec data buffer.
+// Note that the output pointers `json` and `binary` reference memory
+// inside the `metadata` buffer passed in.
+// Any structural problem with the buffer terminates the program via errx().
+void
+json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
+    uint8_t **json, tsk_size_t *json_length, uint8_t **binary, tsk_size_t *binary_length)
+{
+    // check the structure of the codec header and the sizes it specifies
+    if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
+        || binary_length == NULL) {
+        errx(EXIT_FAILURE, "bad parameter value.");
+    }
+    if (metadata_length < JSON_STRUCT_HEADER_SIZE) {
+        errx(EXIT_FAILURE, "metadata truncated.");
+    }
+    if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic))
+        != 0) {
+        errx(EXIT_FAILURE, "bad magic bytes.");
+    }
+
+    uint8_t version = metadata[4];
+    if (version != json_struct_codec_version) {
+        errx(EXIT_FAILURE, "bad version number.");
+    }
+
+    uint64_t json_length_u64 = load_u64_le(metadata + 5);
+    uint64_t binary_length_u64 = load_u64_le(metadata + 13);
+    if (json_length_u64 > UINT64_MAX - (uint64_t) JSON_STRUCT_HEADER_SIZE) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+
+    // determine the number of padding bytes and do more safety checks
+    uint64_t length = (uint64_t) JSON_STRUCT_HEADER_SIZE + json_length_u64;
+    uint64_t padding_length = (8 - (length & 0x07)) % 8;
+    if (padding_length > UINT64_MAX - length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+
+    length += padding_length;
+    if (binary_length_u64 > UINT64_MAX - length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+
+    length += binary_length_u64;
+    if ((uint64_t) metadata_length != length) {
+        errx(EXIT_FAILURE, "unexpected size.");
+    }
+
+    // the total matched metadata_length, so the offsets below stay inside the buffer
+    uint8_t *padding_start = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64;
+    for (uint64_t j = 0; j < padding_length; ++j) {
+        if (padding_start[j] != 0) {
+            errx(EXIT_FAILURE, "padding bytes are nonzero.");
+        }
+    }
+
+    // the structure of the codec data seems valid; return components
+    *json = metadata + JSON_STRUCT_HEADER_SIZE;
+    *json_length = (tsk_size_t) json_length_u64;
+
+    *binary = padding_start + padding_length;
+    *binary_length = (tsk_size_t) binary_length_u64;
+}
+
+// malloc and return a data buffer for the `json+struct` codec
+// that contains the given components; the caller owns the buffer and must
+// free() it. A payload pointer may be NULL only if its length is zero.
+// Bad arguments, an unrepresentable total length, or allocation failure
+// terminate the program via errx().
+void
+json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length,
+    const uint8_t *binary, tsk_size_t binary_length, uint8_t **buffer,
+    tsk_size_t *buffer_length)
+{
+    if (buffer == NULL || buffer_length == NULL || (json == NULL && json_length > 0)
+        || (binary == NULL && binary_length > 0)) {
+        errx(EXIT_FAILURE, "bad parameter value.");
+    }
+
+    // figure out the total length of the codec's data, checking each addition
+    // for wrap-around with the same checks the decoder applies
+    uint64_t length = (uint64_t) JSON_STRUCT_HEADER_SIZE;
+    if ((uint64_t) json_length > UINT64_MAX - length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+    length += (uint64_t) json_length;
+
+    uint64_t padding_length = (8 - (length & 0x07)) % 8;
+    if (padding_length > UINT64_MAX - length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+    length += padding_length;
+
+    if ((uint64_t) binary_length > UINT64_MAX - length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+    length += (uint64_t) binary_length;
+
+    // malloc() and memcpy() take a size_t, which may be narrower than 64 bits
+    if ((uint64_t) (size_t) length != length) {
+        errx(EXIT_FAILURE, "invalid length.");
+    }
+
+    uint8_t *bytes = malloc((size_t) length);
+    if (bytes == NULL) {
+        errx(EXIT_FAILURE, "memory for buffer could not be allocated.");
+    }
+
+    // then set up the bytes for the codec header
+    memcpy(bytes, json_struct_codec_magic, sizeof(json_struct_codec_magic));
+    bytes[4] = json_struct_codec_version;
+    set_u64_le(bytes + 5, (uint64_t) json_length);
+    set_u64_le(bytes + 13, (uint64_t) binary_length);
+
+    // copy in the JSON and binary data, separated by the padding bytes; the goal of the
+    // padding bytes is to ensure that the binary data is 8-byte-aligned relative to the
+    // start of the buffer. Zero-length copies are skipped because passing a NULL
+    // pointer to memcpy() is undefined behaviour even when the length is zero.
+    uint8_t *json_start = bytes + JSON_STRUCT_HEADER_SIZE;
+    uint8_t *padding_start = json_start + json_length;
+    uint8_t *binary_start = padding_start + padding_length;
+    if (json_length > 0) {
+        memcpy(json_start, json, (size_t) json_length);
+    }
+    if (padding_length > 0) {
+        memset(padding_start, 0, (size_t) padding_length);
+    }
+    if (binary_length > 0) {
+        memcpy(binary_start, binary, (size_t) binary_length);
+    }
+
+    // return the buffer and its length; the caller takes ownership of the buffer
+    *buffer = bytes;
+    *buffer_length = (tsk_size_t) length;
+}
+
+int
+main(void)
+{
+    // we start with JSON and binary payloads that we encode into a new buffer
+    // note that the JSON payload does not have to end with a trailing NULL
+    const char json_payload[] = { '{', '"', 'a', '"', ':', '1', '}' };
+    const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 };
+    uint8_t *metadata;
+    tsk_size_t metadata_length;
+
+    json_struct_codec_create_buffer((const uint8_t *) json_payload, sizeof(json_payload),
+        binary_payload, sizeof(binary_payload), &metadata, &metadata_length);
+
+    // then we decode that buffer to recover the json and binary data
+    uint8_t *decoded_json, *decoded_binary;
+    tsk_size_t decoded_json_length, decoded_binary_length;
+
+    json_struct_codec_get_components(metadata, metadata_length, &decoded_json,
+        &decoded_json_length, &decoded_binary, &decoded_binary_length);
+
+    // print the recovered data to demonstrate that the round-trip worked
+    // note that the JSON data is not NULL-terminated unless you put a NULL there!
+    printf("JSON: %.*s\n", (int) decoded_json_length, (const char *) decoded_json);
+
+    printf("Binary data:");
+    for (tsk_size_t j = 0; j < decoded_binary_length; j++) {
+        printf(" %#04x", decoded_binary[j]);
+    }
+    printf("\n");
+
+    free(metadata);
+    return EXIT_SUCCESS;
+}
diff --git a/c/meson.build b/c/meson.build index f5c1a0f585..fb89c06907 100644 --- a/c/meson.build +++ b/c/meson.build @@ -125,6 +125,9 @@ if not meson.is_subproject() executable('multichrom_wright_fisher_singlethreaded', sources: ['examples/multichrom_wright_fisher_singlethreaded.c'], link_with: [tskit_lib], dependencies: lib_deps) + executable('json_struct_metadata', + sources: ['examples/json_struct_metadata.c'], + link_with: [tskit_lib], dependencies: lib_deps) thread_dep = dependency('threads') executable('multichrom_wright_fisher', diff --git a/docs/c-api.rst b/docs/c-api.rst index 228ea3fb7c..e19027cd31 100644 --- a/docs/c-api.rst +++ b/docs/c-api.rst @@ -949,3 +949,60 @@ nodes need to be retained, and use .. literalinclude:: ../c/examples/multichrom_wright_fisher.c :language: c +---------------------------- +Reading and writing metadata +---------------------------- + +The C API does not provide any functionality for manipulating +the contents of metadata. For JSON metadata it is easy to +parse metadata using an external JSON library, and for +struct-encoded metadata the values can be directly unpacked. +Examples of both can be found in +`the SLiM code `_. 
+ +The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>` +metadata codec is a little less straightforward to use, +so we provide here an example of how to write to it +and read from it in C. See :ref:`sec_metadata_codecs_jsonstruct` +for details of how the metadata is encoded. +(In Python, tskit automatically decodes both JSON and binary +metadata and provides it as Python-data-typed metadata, +just as for other codecs.) + +The structure of this example is as follows: + +1. Values specific to the metadata's header (e.g., the magic bytes ``JBLB``). +2. Functions that encode/decode ``uint64_t``, used to store the lengths + of the two components in the header. +3. A function to "read" the metadata: really, to get pointers to the + json and struct components. +4. A function to "write" the metadata, again just given pointers to + and lengths of the two components. +5. The program itself just round-trips a very simple chunk of metadata, + consisting of the JSON ``{"a":1}`` and four binary ``uint8_t`` bytes (``0x01 0x02 0x03 0x04``). + +.. literalinclude:: ../c/examples/json_struct_metadata.c + :language: c + +Much of the complexity of the code is careful error checking of the lengths. + +Here ``json_struct_codec_get_components`` takes a pointer to binary metadata +and returns pointers *into that memory*. +A different approach might have copied the two portions of the metadata +into two buffers (to then be decoded, for instance). +However, that would double the memory footprint, +and since this codec is intended for large metadata, +we did not use that approach in this example. + +Along the same lines, it is worth noting that this example does make a copy of +the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``, +which doubles the memory footprint at that point, and adds the +overhead of copying the data. 
A more efficient approach would be to calculate +the buffer length needed for the codec’s data, allocate the buffer with that +length, and then generate the necessary JSON and binary metadata directly into +that buffer. This would require the metadata-generating code to be more +closely entwined with the code for handling the json+struct codec header and +padding bytes, and so we have chosen not to adopt that approach here, for +pedagogical purposes; but if your use of this codec will involve large +metadata, such an approach is recommended. + diff --git a/docs/development.md b/docs/development.md index 87c92024f5..540236225b 100644 --- a/docs/development.md +++ b/docs/development.md @@ -832,7 +832,7 @@ To generate and view coverage reports for the C tests locally: Compile with coverage enabled: ```bash cd c - meson build -D b_coverage=true + meson setup build -D b_coverage=true ninja -C build ``` @@ -853,7 +853,7 @@ Lines prefixed with `#####` were never executed, lines with numbers show executi `lcov` can be used to create browsable HTML coverage reports: ```bash sudo apt-get install lcov # if needed - lcov --capture --directory build-gcc --output-file coverage.info + lcov --capture --directory build --output-file coverage.info genhtml coverage.info --output-directory coverage_html firefox coverage_html/index.html ``` diff --git a/docs/metadata.md b/docs/metadata.md index 915108c000..02d473ff98 100644 --- a/docs/metadata.md +++ b/docs/metadata.md @@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric types above. `L` is the default. As an example: ``` -{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"} +{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"} ``` Will result in an array of 2 byte integers, prepended by a single-byte array-length. 
@@ -555,10 +555,81 @@ As a special case under the `struct` codec, the top-level type of metadata can b union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should be defined as normal, and will be ignored if the metadata is `None`. +(sec_metadata_codecs_jsonstruct)= + +### `json+struct` + +An additional codec provides the ability to store *both* JSON and binary-encoded data. +This is provided for the case where we want to store some arbitrary metadata +(as JSON) along with a relatively large amount of data (as binary, for efficiency). +For instance, we might want to record a raster map of the sampled area +along with a few pieces of generic information (e.g., the name of the area). + +The metadata schema for "json+struct" metadata basically just specifies both +a JSON metadata schema and a struct metadata schema. +Each entry in the metadata is encoded with either the JSON or the struct codec. + +Specifically, the schema must contain: + +1. a `"json"` entry that is a valid JSON metadata schema (except it does + not need to specify the codec), and +2. a `"struct"` entry that is a valid struct metadata schema (except it also does + not need to specify the codec). + +Furthermore, these two sub-schemas must both define objects, +and must not both define the same property: +in other words, the names of the properties in these must not overlap. + +#### Binary representation + +The underlying structure of the JSON+struct codec is as follows. +(If you're just working with metadata in Python via the tskit interface, +you don't need to worry about this; this is important if you need to write +metadata in C, for instance.) 
+(1) four magic bytes, the ASCII characters `J`, `B`, `L`, and `B`; +(2) a one-byte (`uint8_t`) version number (currently, `1`); +(3) a 64-bit (`uint64_t`) length in bytes for the JSON data, in little-endian format; +(4) a 64-bit length in bytes for the binary (struct) data, also in little-endian format; +(5) the JSON data itself; +(6) zeroed "padding" bytes to bring the start of the binary section +into 8-byte alignment; and +(7) the binary data. +The JSON data is encoded as ASCII, without a terminating null byte, +and the format of the binary data is specified using the "struct" portion +of the metadata schema, described {ref}`above <sec_metadata_codecs_struct>`. + (sec_metadata_schema_examples)= ## Schema examples +### JSON codec + +The JSON codec requires very little: for instance, +``tskit.MetadataSchema.permissive_json()`` simply returns the schema +``{"codec":"json"}``. +Using this schema allows you to include arbitrary data in an entry. + +Here is a more structured schema: + +```{code-cell} +schema = { + "codec": "json", + "title": "Example Metadata", + "type": "object", + "properties": {"name": {"type": "string"}, "size": {"type": "number"}}, + "required": ["name", "size"], + "additionalProperties": False, +} +ms = tskit.MetadataSchema(schema) +encoded = ms.validate_and_encode_row({ + "name": "abc", "size": 123 +}) +``` + +This schema has two properties: "name" and "size"; +"name" is a string and "size" is a number; +both are required, and no additional properties are allowed. + ### Struct codec As an example here is a schema using the `struct` codec which could apply, for example, @@ -622,6 +693,45 @@ unspecified properties will simply be missing in the returned metadata dictionar Also because this is a struct codec, `additionalProperties` must be set to False. This is assumed by default in the struct codec, but has been shown above for clarity. 
+### JSON+Struct codec + +A schema using the `"json+struct"` codec simply needs to specify +the JSON part and the struct part, and to ensure that the two do not share any keys. +Here is a simple example: + +```{code-cell} +schema = { + "codec": "json+struct", + "json": { + "type": "object", + "properties": { + "label": {"type": "string"}, + "id": {"type": "number"}, + }, + "required": ["label"], + }, + "struct": { + "type": "object", + "properties": { + "values": { + "type": "array", + "arrayLengthFormat": "B", + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + }, +} +ms = tskit.MetadataSchema(schema) +row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]} +encoded = ms.validate_and_encode_row(row) +print("Encoded:", encoded) +print("Decoded:", ms.decode_row(encoded)) +``` + +This encodes two things in JSON: a label and an ID number, +and then uses the ``struct`` codec to encode an array of integers in binary. + + (sec_metadata_api_overview)= ## Python Metadata API Overview