Pack Objects API Reference

This document provides a complete API reference for the gitpy.storage.pack, gitpy.storage.pack_index, gitpy.storage.pack_writer, and gitpy.storage.delta modules, which implement Git's pack file format.

Module Overview

from gitpy.storage.pack import (
    PackFile,           # Pack file reader
    PackObject,         # Object read from pack
    PackObjectType,     # Object type enum
    PACK_SIGNATURE,     # Magic bytes b"PACK"
    PACK_VERSION,       # Version number (2)
)

from gitpy.storage.pack_index import (
    PackIndex,          # Pack index for fast lookup
    PackIndexEntry,     # Single index entry
    IDX_SIGNATURE,      # Magic bytes for index
    IDX_VERSION,        # Index version (2)
)

from gitpy.storage.pack_writer import (
    PackWriter,         # Pack file writer
    PackEntry,          # Object to be packed
)

from gitpy.storage.delta import (
    DeltaInsert,        # Insert instruction
    DeltaCopy,          # Copy instruction
    DeltaOp,            # Union type for operations
    parse_delta,        # Parse delta instructions
    apply_delta,        # Apply delta to base
    create_delta,       # Create delta between objects
)

PackObjectType

Module: gitpy.storage.pack

Enumeration of object types in pack files.

class PackObjectType(IntEnum):
    """Object types stored in pack files."""
    COMMIT = 1
    TREE = 2
    BLOB = 3
    TAG = 4
    # 5 is reserved
    OFS_DELTA = 6
    REF_DELTA = 7

Values

Name Value Description
COMMIT 1 Commit object
TREE 2 Tree object
BLOB 3 Blob object
TAG 4 Annotated tag object
OFS_DELTA 6 Delta with offset reference to base
REF_DELTA 7 Delta with SHA-1 reference to base

Methods

to_object_type()

def to_object_type(self) -> str

Convert to string type name.

Returns: "commit", "tree", "blob", "tag", or "delta".

Example:

obj_type = PackObjectType.BLOB
print(obj_type.to_object_type())  # "blob"

from_object_type()

@classmethod
def from_object_type(cls, type_name: str) -> "PackObjectType"

Convert from string type name.

Parameters: - type_name - Object type string ("commit", "tree", "blob", "tag").

Returns: Corresponding PackObjectType enum value.

Raises: - KeyError - Unknown type name.

Example:

obj_type = PackObjectType.from_object_type("blob")
print(obj_type)  # PackObjectType.BLOB

is_delta (property)

@property
def is_delta(self) -> bool

Whether this is a delta type. This is a property, so access it without parentheses (obj_type.is_delta).

Returns: True if OFS_DELTA or REF_DELTA, False otherwise.


PackObject

Module: gitpy.storage.pack

Represents an object read from a pack file.

@dataclass(slots=True)
class PackObject:
    """Object read from pack file."""
    sha: str        # 40-character hex SHA
    type_name: str  # "blob", "tree", "commit", or "tag"
    data: bytes     # Uncompressed object content
    offset: int     # Offset in pack file

Attributes

Name Type Description
sha str 40-character hex SHA-1 hash
type_name str Object type ("blob", "tree", "commit", "tag")
data bytes Decompressed, fully resolved object content
offset int Byte offset in pack file

PackFile

Module: gitpy.storage.pack

Reader for Git pack files. Provides random access to objects using the pack index.

class PackFile:
    pack_path: Path          # Path to .pack file
    version: int             # Pack format version
    object_count: int        # Number of objects in pack
    index: PackIndex         # Associated index

Constructor

def __init__(
    self,
    pack_path: Path,
    index: PackIndex | None = None
) -> None

Open a pack file.

Parameters: - pack_path - Path to .pack file. - index - Optional pre-loaded index. If not provided, loads from .idx file or builds from pack.

Raises: - ValueError - Invalid pack signature or unsupported version. - FileNotFoundError - Pack file doesn't exist.

Example:

from pathlib import Path
from gitpy.storage.pack import PackFile

pack = PackFile(Path(".git/objects/pack/pack-abc123.pack"))
print(f"Pack version: {pack.version}")
print(f"Object count: {pack.object_count}")

Methods

read_object()

def read_object(self, sha: str) -> PackObject | None

Read object by SHA.

Parameters: - sha - 40-character hex SHA-1.

Returns: PackObject if found, None if not in this pack.

Example:

obj = pack.read_object("ce013625030ba8dba906f756967f9e9ca394464a")
if obj:
    print(f"Type: {obj.type_name}")
    print(f"Size: {len(obj.data)}")

__contains__()

def __contains__(self, sha: str) -> bool

Check if object exists in pack.

Parameters: - sha - 40-character hex SHA-1.

Returns: True if object is in this pack.

Example:

if "ce013625..." in pack:
    print("Object found!")

__iter__()

def __iter__(self) -> Iterator[PackObject]

Iterate over all objects in pack.

Yields: PackObject for each object in the pack.

Example:

for obj in pack:
    print(f"{obj.sha} {obj.type_name} {len(obj.data)}")

verify_checksum()

def verify_checksum(self) -> bool

Verify pack file integrity.

Returns: True if checksum matches, False if corrupted.

Example:

if not pack.verify_checksum():
    raise ValueError("Pack file corrupted!")

PackIndexEntry

Module: gitpy.storage.pack_index

A single entry in a pack index.

@dataclass(slots=True)
class PackIndexEntry:
    """Single entry in pack index."""
    sha: str       # 40-character hex SHA
    offset: int    # Offset in pack file
    crc32: int     # CRC32 of packed object data

Attributes

Name Type Description
sha str 40-character hex SHA-1 hash
offset int Byte offset in pack file
crc32 int CRC32 checksum of the packed object data (the object's bytes as stored in the pack)

PackIndex

Module: gitpy.storage.pack_index

Pack index for fast object lookup. Provides O(log n) SHA to offset lookup.

class PackIndex:
    pack_sha: str                    # SHA of associated pack file
    entries: list[PackIndexEntry]    # Sorted by SHA
    object_count: int                # Number of objects

Constructor

def __init__(
    self,
    pack_sha: str,
    entries: list[PackIndexEntry]
) -> None

Create pack index from entries.

Parameters: - pack_sha - 40-character hex SHA of the pack file. - entries - List of PackIndexEntry objects (will be sorted).

Example:

from gitpy.storage.pack_index import PackIndex, PackIndexEntry

entries = [
    PackIndexEntry(sha="a" * 40, offset=12, crc32=0x12345678),
    PackIndexEntry(sha="b" * 40, offset=500, crc32=0xdeadbeef),
]
index = PackIndex(pack_sha="c" * 40, entries=entries)

Properties

object_count

@property
def object_count(self) -> int

Number of objects in pack.

Returns: Total object count.

Methods

find()

def find(self, sha: str) -> PackIndexEntry | None

Look up entry by SHA.

Parameters: - sha - 40-character hex SHA-1.

Returns: PackIndexEntry if found, None otherwise.

Example:

entry = index.find("ce013625030ba8dba906f756967f9e9ca394464a")
if entry:
    print(f"Offset: {entry.offset}")
    print(f"CRC32: {entry.crc32:08x}")

get_offset()

def get_offset(self, sha: str) -> int | None

Get pack file offset for SHA.

Parameters: - sha - 40-character hex SHA-1.

Returns: Byte offset if found, None otherwise.

Example:

offset = index.get_offset("ce013625...")
if offset is not None:
    print(f"Object at offset {offset}")

__contains__()

def __contains__(self, sha: str) -> bool

Check if SHA is in index.

Parameters: - sha - 40-character hex SHA-1.

Returns: True if found.

from_file()

@classmethod
def from_file(cls, path: Path) -> "PackIndex"

Load index from .idx file.

Parameters: - path - Path to index file.

Returns: Parsed PackIndex.

Raises: - ValueError - Invalid index format or version.

Example:

index = PackIndex.from_file(Path(".git/objects/pack/pack-abc123.idx"))

parse()

@classmethod
def parse(cls, data: bytes) -> "PackIndex"

Parse index from raw bytes.

Parameters: - data - Raw index file content.

Returns: Parsed PackIndex.

Raises: - ValueError - Invalid signature or version.

serialize()

def serialize(self) -> bytes

Serialize to version 2 index format.

Returns: Raw bytes ready to write to .idx file.

Example:

data = index.serialize()
Path("pack-abc123.idx").write_bytes(data)

PackEntry

Module: gitpy.storage.pack_writer

Object to be written to a pack file.

@dataclass(slots=True)
class PackEntry:
    """Object to be written to pack."""
    sha: str                          # 40-character hex SHA
    type_name: str                    # Object type
    data: bytes                       # Object content (or delta)
    delta_base_sha: str | None = None # Base SHA if deltified

Attributes

Name Type Description
sha str 40-character hex SHA-1
type_name str Object type ("blob", "tree", etc.)
data bytes Object content or delta instructions
delta_base_sha str | None If deltified, SHA of base object

PackWriter

Module: gitpy.storage.pack_writer

Writer for Git pack files. Creates .pack and .idx files from objects.

class PackWriter:
    objects_dir: Path    # .git/objects directory
    pack_dir: Path       # .git/objects/pack directory

Constructor

def __init__(self, objects_dir: Path) -> None

Initialize pack writer.

Parameters: - objects_dir - Path to .git/objects directory.

Example:

from gitpy.storage.pack_writer import PackWriter
from pathlib import Path

writer = PackWriter(Path(".git/objects"))

Methods

write_pack()

def write_pack(
    self,
    objects: Iterable[GitObject],
    *,
    deltify: bool = True,
    window_size: int = 10,
) -> tuple[Path, Path]

Write objects to a new pack file.

Parameters: - objects - Iterable of GitObject instances to pack. - deltify - If True, apply delta compression (default: True). - window_size - Number of recent objects to consider as delta bases (default: 10).

Returns: Tuple of (pack_path, index_path).

Example:

from gitpy.objects import Blob

objects = [
    Blob(data=b"content 1"),
    Blob(data=b"content 2"),
    Blob(data=b"content 3"),
]

pack_path, idx_path = writer.write_pack(objects)
print(f"Pack: {pack_path}")
print(f"Index: {idx_path}")

write_pack_from_entries()

def write_pack_from_entries(
    self,
    entries: list[PackEntry],
) -> tuple[Path, Path]

Write pre-prepared entries to pack.

Parameters: - entries - List of PackEntry objects (may include deltas).

Returns: Tuple of (pack_path, index_path).


Delta Types

Module: gitpy.storage.delta

DeltaInsert

Instruction to insert literal data.

@dataclass(slots=True)
class DeltaInsert:
    """Insert literal data into result."""
    data: bytes

DeltaCopy

Instruction to copy from base object.

@dataclass(slots=True)
class DeltaCopy:
    """Copy bytes from base object."""
    offset: int  # Byte offset in base
    size: int    # Number of bytes to copy

DeltaOp

Union type for delta operations.

type DeltaOp = DeltaInsert | DeltaCopy

Delta Functions

Module: gitpy.storage.delta

parse_delta()

def parse_delta(data: bytes) -> tuple[int, int, list[DeltaOp]]

Parse delta instructions.

Parameters: - data - Raw delta bytes.

Returns: Tuple of (source_size, target_size, operations).

Raises: - ValueError - Invalid delta instruction (e.g., 0x00 byte).

Example:

from gitpy.storage.delta import parse_delta

source_size, target_size, ops = parse_delta(delta_bytes)
print(f"Base size: {source_size}")
print(f"Result size: {target_size}")
for op in ops:
    print(op)

apply_delta()

def apply_delta(base: bytes, delta_data: bytes) -> bytes

Apply delta to reconstruct target.

Parameters: - base - Source/base object data. - delta_data - Raw delta bytes.

Returns: Reconstructed target data.

Raises: - ValueError - Size mismatch or invalid delta.

Example:

from gitpy.storage.delta import apply_delta

base = b"Hello, World!"
result = apply_delta(base, delta_bytes)
print(result)  # b"Hello, Git World!"

create_delta()

def create_delta(source: bytes, target: bytes) -> bytes

Create delta from source to target.

Parameters: - source - Base object data. - target - Target object data.

Returns: Delta bytes that transform source into target.

Example:

from gitpy.storage.delta import create_delta

source = b"Hello, World!"
target = b"Hello, Git World!"

delta = create_delta(source, target)
print(f"Delta size: {len(delta)} (vs {len(target)} for full)")

read_delta_size()

def read_delta_size(data: bytes, offset: int) -> tuple[int, int]

Read variable-length size from delta header.

Parameters: - data - Delta data bytes. - offset - Starting position.

Returns: Tuple of (size, bytes_consumed).


Variable-Length Integer Functions

Module: gitpy.storage.pack

read_pack_object_header()

def read_pack_object_header(
    data: bytes,
    offset: int
) -> tuple[int, int, int]

Read pack object header.

Parameters: - data - Pack file data. - offset - Byte offset to start reading.

Returns: Tuple of (object_type, uncompressed_size, bytes_consumed).

Example:

obj_type, size, consumed = read_pack_object_header(pack_data, offset)
print(f"Type: {PackObjectType(obj_type).to_object_type()}")
print(f"Size: {size}")

write_pack_object_header()

def write_pack_object_header(obj_type: int, size: int) -> bytes

Encode pack object header.

Parameters: - obj_type - Object type value (1-4, 6-7). - size - Uncompressed object size.

Returns: Encoded header bytes.

read_ofs_delta_offset()

def read_ofs_delta_offset(
    data: bytes,
    offset: int
) -> tuple[int, int]

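Read the OFS_DELTA base offset. Git stores it as a variable-length positive distance measured backwards from the delta object's position in the pack (hence "negative offset").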

Parameters: - data - Pack data. - offset - Starting position.

Returns: Tuple of (base_offset, bytes_consumed).

write_ofs_delta_offset()

def write_ofs_delta_offset(offset: int) -> bytes

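Encode an OFS_DELTA base offset. The value is the positive distance back from the delta object to its base; it is "negative" only in the sense that it is subtracted from the delta object's position.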

Parameters: - offset - Positive offset value to encode.

Returns: Encoded offset bytes.


Constants

Pack File Constants

PACK_SIGNATURE = b"PACK"  # Magic bytes at start of pack
PACK_VERSION = 2          # Current pack format version

Index File Constants

IDX_SIGNATURE = b"\xff\x74\x4f\x63"  # Magic: 0xff744f63
IDX_VERSION = 2                       # Current index format version

Complete Example

from pathlib import Path
from gitpy.repository import Repository
from gitpy.objects import Blob
from gitpy.storage.pack import PackFile
from gitpy.storage.pack_writer import PackWriter
from gitpy.storage.delta import create_delta, apply_delta

# Initialize repository
repo = Repository.init(Path("/tmp/pack-demo"))

# Create some similar blobs
base_content = b"Hello, World!\n" * 100
blobs = [
    Blob(data=base_content),
    Blob(data=base_content + b"Line 2\n"),
    Blob(data=base_content + b"Line 2\nLine 3\n"),
]

# Write as loose objects first
for blob in blobs:
    repo.objects.write(blob)

# Create pack from objects
writer = PackWriter(repo.git_dir / "objects")
pack_path, idx_path = writer.write_pack(blobs, deltify=True)

print(f"Pack created: {pack_path}")
print(f"Index created: {idx_path}")
print(f"Pack size: {pack_path.stat().st_size} bytes")

# Read back from pack
pack = PackFile(pack_path)
print(f"Objects in pack: {pack.object_count}")

for obj in pack:
    print(f"  {obj.sha[:8]} {obj.type_name:6} {len(obj.data):5} bytes")

# Verify object can be read
first_blob = blobs[0]
restored = pack.read_object(first_blob.oid)
assert restored is not None
assert restored.data == first_blob.data
print("Pack verification passed!")

# Delta compression demo
source = b"The quick brown fox jumps over the lazy dog"
target = b"The quick brown cat jumps over the lazy dog"

delta = create_delta(source, target)
print(f"\nDelta compression:")
print(f"  Source: {len(source)} bytes")
print(f"  Target: {len(target)} bytes")
print(f"  Delta:  {len(delta)} bytes ({len(delta)/len(target)*100:.1f}%)")

restored = apply_delta(source, delta)
assert restored == target
print("Delta verification passed!")

Git Compatibility

The pack modules are fully compatible with Git:

Test Result
gitpy reads Git-created packs Pass
Git reads gitpy-created packs Pass
git verify-pack validates gitpy packs Pass
Delta chains resolve correctly Pass
Large offsets (>2GB) supported Pass

Verification:

# Create pack with gitpy, verify with Git
python3 -c "
from gitpy import Repository
repo = Repository.find()
repo.objects.repack()
"
git verify-pack -v .git/objects/pack/*.pack

# Create pack with Git, read with gitpy
git gc
python3 -c "
from gitpy import Repository
repo = Repository.find()
# gitpy can read all objects from Git's pack
"

Performance Notes

Lookup Complexity

Operation Time Complexity
Find by SHA (with index) O(log n)
Read undeltified object O(1)
Read deltified object O(depth)
Iterate all objects O(n)

Memory Usage

  • PackFile: Memory-maps pack data, ~constant overhead
  • PackIndex: ~28 bytes per object in memory
  • Delta resolution: Caches resolved objects to avoid re-resolution

Best Practices

# Reuse PackFile instances (don't reopen repeatedly)
pack = PackFile(path)
for sha in shas_to_read:
    obj = pack.read_object(sha)

# Use deltify=True for similar objects
writer.write_pack(similar_objects, deltify=True)

# Use larger window for better compression (slower)
writer.write_pack(objects, deltify=True, window_size=50)