Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions c/examples/json_struct_metadata.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <string.h>
#include <tskit.h>

// These are properties of the ``json+struct`` codec, documented in tskit.
// The fixed-size header read and written below is 21 bytes long: 4 magic bytes
// (offset 0), a 1-byte version number (offset 4), and two 8-byte little-endian
// lengths -- JSON payload (offset 5), then binary payload (offset 13).
#define JSON_STRUCT_HEADER_SIZE 21

// magic bytes expected at the very start of a ``json+struct`` buffer
const uint8_t json_struct_codec_magic[4] = { 'J', 'B', 'L', 'B' };
// the codec version this example writes, and the only one it accepts when reading
const uint8_t json_struct_codec_version = 1;

// Decode the eight bytes starting at `p` as a little-endian uint64_t:
// p[0] is the least significant byte and p[7] the most significant.
// Works byte-by-byte, so `p` does not need any particular alignment.
static uint64_t
load_u64_le(const uint8_t *p)
{
    uint64_t result = 0;

    // start with the most significant byte and shift it up as each
    // less significant byte is folded in below it
    for (int k = 7; k >= 0; k--) {
        result = (result << 8) | (uint64_t) p[k];
    }
    return result;
}

// Encode `value` as eight little-endian bytes at `dest`:
// dest[0] receives the least significant byte, dest[7] the most significant.
// Works byte-by-byte, so `dest` does not need any particular alignment.
static void
set_u64_le(uint8_t *dest, uint64_t value)
{
    // emit the low byte, then shift the next byte down into its place
    for (int k = 0; k < 8; k++) {
        dest[k] = (uint8_t) (value & 0xFF);
        value >>= 8;
    }
}

// Split a `json+struct` codec buffer into its JSON and binary payloads.
//
// `metadata` / `metadata_length` describe the complete codec buffer. On success,
// `*json` / `*json_length` and `*binary` / `*binary_length` describe the two
// payloads. No data is copied: the returned pointers point INTO `metadata`, so
// they are only usable for as long as that buffer is.
//
// Any structural problem (NULL argument, short buffer, wrong magic or version,
// lengths that overflow or do not add up to `metadata_length`, nonzero padding)
// terminates the program through errx().
void
json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
    uint8_t **json, tsk_size_t *json_length, uint8_t **binary, tsk_size_t *binary_length)
{
    const uint64_t header_size = (uint64_t) JSON_STRUCT_HEADER_SIZE;

    // validate the arguments and the fixed part of the header
    if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
        || binary_length == NULL) {
        errx(EXIT_FAILURE, "bad parameter value.");
    }
    if (metadata_length < JSON_STRUCT_HEADER_SIZE) {
        errx(EXIT_FAILURE, "metadata truncated.");
    }
    if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic)) != 0) {
        errx(EXIT_FAILURE, "bad magic bytes.");
    }
    if (metadata[4] != json_struct_codec_version) {
        errx(EXIT_FAILURE, "bad version number.");
    }

    // the two payload sizes follow the version byte, as little-endian uint64_t
    const uint64_t json_size = load_u64_le(metadata + 5);
    const uint64_t binary_size = load_u64_le(metadata + 13);

    // Rebuild the total size the header implies, one term at a time, refusing
    // any addition that would wrap around UINT64_MAX.
    if (json_size > UINT64_MAX - header_size) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    uint64_t expected_size = header_size + json_size;

    // zero bytes needed after the JSON so the binary payload starts on a
    // multiple of 8 bytes from the beginning of the buffer
    const uint64_t pad_size = (8 - (expected_size & 0x07)) % 8;
    if (pad_size > UINT64_MAX - expected_size) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    expected_size += pad_size;

    if (binary_size > UINT64_MAX - expected_size) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    expected_size += binary_size;

    // the buffer must be exactly header + JSON + padding + binary
    if ((uint64_t) metadata_length != expected_size) {
        errx(EXIT_FAILURE, "unexpected size.");
    }

    // the sizes are now known to lie inside the buffer, so these are in bounds
    uint8_t *json_start = metadata + JSON_STRUCT_HEADER_SIZE;
    uint8_t *pad_start = json_start + json_size;
    uint8_t *binary_start = pad_start + pad_size;

    for (const uint8_t *q = pad_start; q != binary_start; q++) {
        if (*q != 0) {
            errx(EXIT_FAILURE, "padding bytes are nonzero.");
        }
    }

    // the structure of the codec data seems valid; hand back the components
    *json = json_start;
    *json_length = (tsk_size_t) json_size;

    *binary = binary_start;
    *binary_length = (tsk_size_t) binary_size;
}

// Allocate (with malloc) and fill a data buffer for the `json+struct` codec
// containing the given components.
//
// `json` / `json_length` and `binary` / `binary_length` are the two payloads;
// each is copied into the new buffer. A payload pointer may be NULL only when
// its length is 0. On success `*buffer` receives the new buffer and
// `*buffer_length` its size in bytes; the caller takes ownership and must
// free() it.
//
// Bad arguments, a total size that cannot be represented, or an allocation
// failure terminate the program through errx().
void
json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length,
    const uint8_t *binary, tsk_size_t binary_length, uint8_t **buffer,
    tsk_size_t *buffer_length)
{
    // validate arguments the same way json_struct_codec_get_components() does
    if (buffer == NULL || buffer_length == NULL || (json == NULL && json_length > 0)
        || (binary == NULL && binary_length > 0)) {
        errx(EXIT_FAILURE, "bad parameter value.");
    }

    const uint64_t header_length = (uint64_t) JSON_STRUCT_HEADER_SIZE;
    const uint64_t json_length_u64 = (uint64_t) json_length;
    const uint64_t binary_length_u64 = (uint64_t) binary_length;

    // Figure out the total length of the codec's data, one term at a time,
    // refusing any addition that would wrap around UINT64_MAX.
    if (json_length_u64 > UINT64_MAX - header_length) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    uint64_t total_length = header_length + json_length_u64;

    // the goal of the padding bytes is to ensure that the binary data is
    // 8-byte-aligned relative to the start of the buffer
    const uint64_t padding_length = (8 - (total_length & 0x07)) % 8;
    if (padding_length > UINT64_MAX - total_length) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    total_length += padding_length;

    if (binary_length_u64 > UINT64_MAX - total_length) {
        errx(EXIT_FAILURE, "invalid length.");
    }
    total_length += binary_length_u64;

    // The total must survive the conversions to size_t (for malloc/memcpy) and
    // to tsk_size_t (for the caller); a round-trip cast detects truncation.
    const size_t alloc_size = (size_t) total_length;
    if ((uint64_t) alloc_size != total_length
        || (uint64_t) (tsk_size_t) total_length != total_length) {
        errx(EXIT_FAILURE, "invalid length.");
    }

    uint8_t *bytes = malloc(alloc_size);
    if (bytes == NULL) {
        errx(EXIT_FAILURE, "memory for buffer could not be allocated.");
    }

    // set up the bytes for the codec header
    memcpy(bytes, json_struct_codec_magic, sizeof(json_struct_codec_magic));
    bytes[4] = json_struct_codec_version;
    set_u64_le(bytes + 5, json_length_u64);
    set_u64_le(bytes + 13, binary_length_u64);

    // Copy in the JSON and binary data, separated by the zeroed padding bytes.
    // Empty payloads are skipped so memcpy() is never handed a NULL source.
    uint8_t *cursor = bytes + JSON_STRUCT_HEADER_SIZE;
    if (json_length_u64 > 0) {
        memcpy(cursor, json, (size_t) json_length_u64);
    }
    cursor += json_length_u64;

    memset(cursor, 0, (size_t) padding_length);
    cursor += padding_length;

    if (binary_length_u64 > 0) {
        memcpy(cursor, binary, (size_t) binary_length_u64);
    }

    // return the buffer and its length; the caller takes ownership of the buffer
    *buffer = bytes;
    *buffer_length = (tsk_size_t) total_length;
}

// Round-trip demonstration: encode a small JSON payload and a small binary
// payload into a `json+struct` buffer, decode that buffer again, and print
// what came back.
int
main(int argc, char **argv)
{
    // The payloads to encode. The JSON text is deliberately stored without a
    // terminating NUL byte: the codec records its length, so none is needed.
    const char json_text[] = { '{', '"', 'a', '"', ':', '1', '}' };
    const uint8_t raw_bytes[] = { 0x01, 0x02, 0x03, 0x04 };

    // encode both payloads into a freshly allocated codec buffer
    uint8_t *encoded = NULL;
    tsk_size_t encoded_length = 0;

    json_struct_codec_create_buffer((const uint8_t *) json_text, sizeof(json_text),
        raw_bytes, sizeof(raw_bytes), &encoded, &encoded_length);

    // decode the buffer; the pointers we get back point inside `encoded`
    uint8_t *json_out = NULL;
    uint8_t *binary_out = NULL;
    tsk_size_t json_out_length = 0;
    tsk_size_t binary_out_length = 0;

    json_struct_codec_get_components(encoded, encoded_length, &json_out,
        &json_out_length, &binary_out, &binary_out_length);

    // Show the recovered data. The JSON is not NUL-terminated (unless the
    // writer put a NUL there), so it is printed with an explicit length.
    printf("JSON: %.*s\n", (int) json_out_length, json_out);

    printf("Binary data:");
    for (const uint8_t *b = binary_out; b != binary_out + binary_out_length; b++) {
        printf(" %#04x", *b);
    }
    printf("\n");

    // both decoded pointers reference this buffer, so free it only at the end
    free(encoded);
    return EXIT_SUCCESS;
}
3 changes: 3 additions & 0 deletions c/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ if not meson.is_subproject()
executable('multichrom_wright_fisher_singlethreaded',
sources: ['examples/multichrom_wright_fisher_singlethreaded.c'],
link_with: [tskit_lib], dependencies: lib_deps)
executable('json_struct_metadata',
sources: ['examples/json_struct_metadata.c'],
link_with: [tskit_lib], dependencies: lib_deps)

thread_dep = dependency('threads')
executable('multichrom_wright_fisher',
Expand Down
57 changes: 57 additions & 0 deletions docs/c-api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -949,3 +949,60 @@ nodes need to be retained, and use
.. literalinclude:: ../c/examples/multichrom_wright_fisher.c
:language: c

----------------------------
Reading and writing metadata
----------------------------

The C API does not provide any functionality for manipulating
the contents of metadata. For JSON metadata it is easy to
parse metadata using an external JSON library, and for
struct-encoded metadata the values can be directly unpacked.
Examples of both can be found in
`the SLiM code <https://messerlab.github.com/slim/>`_.

The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>`
metadata codec is a little less straightforward to use,
so we provide here an example of how to write to it
and read from it in C. See :ref:`sec_metadata_codecs_jsonstruct`
for details of how the metadata is encoded.
(In Python, tskit automatically decodes both JSON and binary
metadata and provides it as Python-data-typed metadata,
just as for other codecs.)

The structure of this example is as follows:

1. Values specific to the metadata's header (e.g., the magic bytes ``JBLB``).
2. Functions that encode/decode ``uint64_t``, used to store the lengths
of the two components in the header.
3. A method to "read" the metadata: really, to get pointers to the
json and struct components.
4. A method to "write" the metadata, again just given pointers to
and lengths of the two components.
5. The program itself just round-trips a very simple chunk of metadata,
consisting of the JSON ``{"a":1}`` and four binary ``uint8_t`` bytes (``0x01 0x02 0x03 0x04``).

.. literalinclude:: ../c/examples/json_struct_metadata.c
:language: c

Much of the complexity of the code is careful error checking of the lengths.

Here ``json_struct_codec_get_components`` takes a pointer to binary metadata
and returns pointers *into that same memory*, without copying anything.
A different approach might have copied the two portions of the metadata
into two buffers (to then be decoded, for instance).
However, that would double the memory footprint,
and since this codec is intended for large metadata,
we did not use that approach in this example.

Along the same lines, it is worth noting that this example does make a copy of
the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``,
which doubles the memory footprint at that point, and adds the
overhead of copying the data. A more efficient approach would be to calculate
the buffer length needed for the codec’s data, allocate the buffer with that
length, and then generate the necessary JSON and binary metadata directly into
that buffer. This would require the metadata-generating code to be more
closely entwined with the code for handling the json+struct codec header and
padding bytes, and so we have chosen not to adopt that approach here, for
pedagogical purposes; but if your use of this codec will involve large
metadata, such an approach is recommended.

4 changes: 2 additions & 2 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -832,7 +832,7 @@ To generate and view coverage reports for the C tests locally:
Compile with coverage enabled:
```bash
cd c
meson build -D b_coverage=true
meson setup build -D b_coverage=true
ninja -C build
```

Expand All @@ -853,7 +853,7 @@ Lines prefixed with `#####` were never executed, lines with numbers show executi
`lcov` can be used to create browsable HTML coverage reports:
```bash
sudo apt-get install lcov # if needed
lcov --capture --directory build-gcc --output-file coverage.info
lcov --capture --directory build --output-file coverage.info
genhtml coverage.info --output-directory coverage_html
firefox coverage_html/index.html
```
Expand Down
112 changes: 111 additions & 1 deletion docs/metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric
types above. `L` is the default. As an example:

```
{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"}
{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"}
```

Will result in an array of 2 byte integers, prepended by a single-byte array-length.
Expand Down Expand Up @@ -555,10 +555,81 @@ As a special case under the `struct` codec, the top-level type of metadata can b
union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should
be defined as normal, and will be ignored if the metadata is `None`.

(sec_metadata_codecs_jsonstruct)=

### `json+struct`

An additional codec provides the ability to store *both* JSON and binary-encoded data.
This is provided for the case where we want to store some arbitrary metadata
(as JSON) along with a relatively large amount of data (as binary, for efficiency).
For instance, we might want to record a raster map of the sampled area
along with a few pieces of generic information (e.g., the name of the area).

The metadata schema for "json+struct" metadata basically just specifies both
a JSON metadata schema and a struct metadata schema.
Each entry in the metadata is encoded with either the JSON or the struct codec.

Specifically, the schema must contain:

1. a `"json"` entry that is a valid JSON metadata schema (except it does
not need to specify the codec), and
2. a `"struct"` entry that is a valid struct metadata schema (except it also does
not need to specify the codec).

Furthermore, these two sub-schemas must both define objects,
and must not both define the same property:
in other words, the names of the properties in these must not overlap.

#### Binary representation

The underlying structure of the JSON+struct codec is as follows.
(If you're just working with metadata in Python via the tskit interface,
you don't need to worry about this; this is important if you need to write
metadata in C, for instance.)
(1) four magic bytes, the ASCII characters `J`, `B`, `L`, and `B`;
(2) a one-byte (`uint8_t`) version number (currently, `1`);
(3) a 64-bit (`uint64_t`) length in bytes for the JSON data, in little-endian format;
(4) a 64-bit (`uint64_t`) length in bytes for the binary (struct) data, also in little-endian format;
(5) the JSON data itself;
(6) zero-ed "padding" bytes to bring the start of the binary section
into 8-byte alignment; and
(7) the binary data.
The JSON data is encoded as ASCII, without a null terminating byte,
and the format of the binary data is specified using the "struct" portion
of the metadata schema, described {ref}`above <sec_metadata_codecs_struct>`.

(sec_metadata_schema_examples)=

## Schema examples

### JSON codec

The JSON codec requires very little: for instance,
``tskit.MetadataSchema.permissive_json()`` simply returns the schema
``{"codec":"json"}``.
Using this schema allows you to include arbitrary data in an entry.

Here is a more structured schema:

```{code-cell}
schema = {
"codec": "json",
"title": "Example Metadata",
"type": "object",
"properties": {"name": {"type": "string"}, "size": {"type": "number"}},
"required": ["name", "size"],
"additionalProperties": False,
}
ms = tskit.MetadataSchema(schema)
encoded = ms.validate_and_encode_row({
"name": "abc", "size": 123
})
```

This schema has two properties: "name" and "size";
"name" is a string and "size" is a number;
both are required, and no additional properties are allowed.

### Struct codec

As an example here is a schema using the `struct` codec which could apply, for example,
Expand Down Expand Up @@ -622,6 +693,45 @@ unspecified properties will simply be missing in the returned metadata dictionar
Also because this is a struct codec, `additionalProperties` must be set to False. This
is assumed by default in the struct codec, but has been shown above for clarity.

### JSON+Struct codec

A schema using the `"json+struct"` codec simply needs to specify
the JSON part and the struct part, and be sure the two do not share any keys.
Here is a simple example:

```{code-cell}
schema = {
"codec": "json+struct",
"json": {
"type": "object",
"properties": {
"label": {"type": "string"},
"id": {"type": "number"},
},
"required": ["label"],
},
"struct": {
"type": "object",
"properties": {
"values": {
"type": "array",
"arrayLengthFormat": "B",
"items": {"type": "number", "binaryFormat": "i"},
},
},
},
}
ms = tskit.MetadataSchema(schema)
row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]}
encoded = ms.validate_and_encode_row(row)
print("Encoded:", encoded)
print("Decoded:", ms.decode_row(encoded))
```

This encodes two things in JSON: a label and an ID number,
and then uses the ``struct`` codec to encode an array of integers in binary.


(sec_metadata_api_overview)=

## Python Metadata API Overview
Expand Down
Loading