Created
April 2, 2020 01:23
-
-
Save wesm/d48908018c4b7a0d9789a31d10caf525 to your computer and use it in GitHub Desktop.
Example of round-tripping Arrow data through the new C ABI/Interface
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import pyarrow as pa\n", | |
| "\n", | |
| "from pyarrow.cffi import ffi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pyarrow.RecordBatch\n", | |
| "a: int64\n", | |
| "b: string" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df = pd.DataFrame({'a': [1, 2, 3, 4, 5],\n", | |
| " 'b': ['a', 'b', 'c', 'd', 'e']})\n", | |
| "\n", | |
| "rb = pa.record_batch(df)\n", | |
| "rb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Export pyarrow.RecordBatch to C Interface" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "c_schema = ffi.new(\"struct ArrowSchema*\")\n", | |
| "c_schema_ptr = int(ffi.cast(\"uintptr_t\", c_schema))\n", | |
| "\n", | |
| "# NB: RecordBatch is packed as a StructArray\n", | |
| "c_batch = ffi.new(\"struct ArrowArray*\")\n", | |
| "c_batch_ptr = int(ffi.cast(\"uintptr_t\", c_batch))\n", | |
| "\n", | |
| "rb.schema._export_to_c(c_schema_ptr)\n", | |
| "rb._export_to_c(c_batch_ptr)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Import pyarrow.RecordBatch given addresses of ArrowSchema, ArrowArray" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Deserialize schema\n", | |
| "schema2 = pa.Schema._import_from_c(c_schema_ptr)\n", | |
| "\n", | |
| "# Deserialize batch\n", | |
| "rb2 = pa.RecordBatch._import_from_c(c_batch_ptr, schema2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "rb.equals(rb2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>a</th>\n", | |
| " <th>b</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>b</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3</td>\n", | |
| " <td>c</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>5</td>\n", | |
| " <td>e</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " a b\n", | |
| "0 1 a\n", | |
| "1 2 b\n", | |
| "2 3 c\n", | |
| "3 4 d\n", | |
| "4 5 e" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "rb2.to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Author
Hey Wes!
I think this solves it actually:
c_schema = ffi.new("struct ArrowSchema*")
c_schema_ptr = int(ffi.cast("uintptr_t", c_schema))
# NB: RecordBatch is packed as a StructArray
c_batch = arrow_c.new("struct ArrowArray*") # I think arrow_c is meant to be ffi here
c_batch_ptr = int(ffi.cast("uintptr_t", c_batch))
I'm working to make the C Data Interface work with Arrow.jl :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where is arrow_c defined?