From fcb011a8ae2d703bbec964b9f94876172c90f949 Mon Sep 17 00:00:00 2001 From: Yicong Huang Date: Thu, 12 Mar 2026 01:15:38 +0000 Subject: [PATCH 1/2] fix: write offset buffer for empty variable-width vectors per Arrow spec --- .../adapter/jdbc/ResultSetUtilityTest.java | 22 ++++++----- .../vector/BaseLargeVariableWidthVector.java | 16 +++++++- .../arrow/vector/BaseVariableWidthVector.java | 16 +++++++- .../apache/arrow/vector/TestValueVector.java | 38 +++++++++++++++++++ 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java index c7dc9b2791..e5039ccf59 100644 --- a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java +++ b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java @@ -43,15 +43,19 @@ public void testZeroRowResultSet() throws Exception { .setReuseVectorSchemaRoot(reuseVectorSchemaRoot) .build(); - ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config); - assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should haveNext() before use"); - VectorSchemaRoot root = iter.next(); - assertNotNull(root, "VectorSchemaRoot from first next() result should never be null"); - assertEquals( - 0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet should have zero rows"); - assertFalse( - iter.hasNext(), - "hasNext() should return false on empty ResultSets after initial next() call"); + try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) { + assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should haveNext() before use"); + VectorSchemaRoot root = iter.next(); + assertNotNull(root, "VectorSchemaRoot from first next() result should never be null"); + assertEquals( + 0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet should have zero rows"); + 
assertFalse( + iter.hasNext(), + "hasNext() should return false on empty ResultSets after initial next() call"); + if (!reuseVectorSchemaRoot) { + root.close(); + } + } } } } diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 6c451f10a7..3fac195786 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -373,14 +373,26 @@ private void setReaderAndWriterIndex() { valueBuffer.readerIndex(0); if (valueCount == 0) { validityBuffer.writerIndex(0); - offsetBuffer.writerIndex(0); valueBuffer.writerIndex(0); } else { final long lastDataOffset = getStartOffset(valueCount); validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount)); - offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH); valueBuffer.writerIndex(lastDataOffset); } + // The IPC serializer determines readable bytes from `readerIndex` and `writerIndex`. + // If both are 0, zero bytes are written to the IPC stream, which crashes IPC readers + // in other libraries. According to the Arrow spec, we should still output the offset + // buffer, which is [0] for an empty vector. + final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH; + if (offsetBuffer.capacity() < requiredOffsetBufferSize) { + ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize); + if (offsetBuffer.capacity() > 0) { + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + } + offsetBuffer.getReferenceManager().release(); + offsetBuffer = newOffsetBuffer; + } + offsetBuffer.writerIndex(requiredOffsetBufferSize); } /** Same as {@link #allocateNewSafe()}.
*/ diff --git a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 96e2afbd29..d5bd167256 100644 --- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -389,14 +389,26 @@ private void setReaderAndWriterIndex() { valueBuffer.readerIndex(0); if (valueCount == 0) { validityBuffer.writerIndex(0); - offsetBuffer.writerIndex(0); valueBuffer.writerIndex(0); } else { final int lastDataOffset = getStartOffset(valueCount); validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount)); - offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH); valueBuffer.writerIndex(lastDataOffset); } + // The IPC serializer determines readable bytes from `readerIndex` and `writerIndex`. + // If both are 0, zero bytes are written to the IPC stream, which crashes IPC readers + // in other libraries. According to the Arrow spec, we should still output the offset + // buffer, which is [0] for an empty vector. + final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH; + if (offsetBuffer.capacity() < requiredOffsetBufferSize) { + ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize); + if (offsetBuffer.capacity() > 0) { + newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity()); + } + offsetBuffer.getReferenceManager().release(); + offsetBuffer = newOffsetBuffer; + } + offsetBuffer.writerIndex(requiredOffsetBufferSize); } /** Same as {@link #allocateNewSafe()}.
*/ diff --git a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index df42d04e60..22c93b0cbe 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -3940,4 +3940,42 @@ public void testVectorLoadUnloadOnNonVariadicVectors() { } } } + + @Test + public void testEmptyVarCharOffsetBuffer() { + // Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0) + // even when valueCount is 0, per Arrow specification. + try (VarCharVector vector = newVarCharVector("varchar", allocator)) { + vector.allocateNew(); + vector.setValueCount(0); + + List<ArrowBuf> buffers = vector.getFieldBuffers(); + // buffers: [validity, offset, data] + assertTrue( + buffers.get(1).readableBytes() >= BaseVariableWidthVector.OFFSET_WIDTH, + "Offset buffer should have at least " + + BaseVariableWidthVector.OFFSET_WIDTH + + " bytes for offset[0]"); + assertEquals(0, vector.getOffsetBuffer().getInt(0)); + } + } + + @Test + public void testEmptyLargeVarCharOffsetBuffer() { + // Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0) + // even when valueCount is 0, per Arrow specification. + try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar", allocator)) { + vector.allocateNew(); + vector.setValueCount(0); + + List<ArrowBuf> buffers = vector.getFieldBuffers(); + // buffers: [validity, offset, data] + assertTrue( + buffers.get(1).readableBytes() >= BaseLargeVariableWidthVector.OFFSET_WIDTH, + "Offset buffer should have at least " + + BaseLargeVariableWidthVector.OFFSET_WIDTH + + " bytes for offset[0]"); + assertEquals(0, vector.getOffsetBuffer().getLong(0)); + } + } } From b0a6a122eaee6a6ca2a82fa3c60c88a7804aa61a Mon Sep 17 00:00:00 2001 From: Yicong Huang Date: Thu, 12 Mar 2026 03:40:05 +0000 Subject: [PATCH 2/2] chore: retrigger CI