Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions docs/get_started.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ jupyter: python3
---

```{python}
#| include: false
# | include: false
import time
import pandas as pd
pd.options.display.max_rows = 25
```
Expand Down Expand Up @@ -126,7 +127,7 @@ While we’ll do our best to keep the automatically generated metadata consisten

## Versioning

Every [](`~pins.boards.BaseBoard.pin_write`) will create a new version:
By default, calls to [](`~pins.boards.BaseBoard.pin_write`) will usually create a new version:

```{python}
board2 = board_temp()
Expand All @@ -136,6 +137,23 @@ board2.pin_write([1,2], name = "x", type = "json")
board2.pin_versions("x")
```

The only exception is when the data is identical to the most recent version (compared via file hash):

```{python}
board2.pin_write([1], name = "x", type = "json")
time.sleep(1.1) # later, let's try and write a new version of the same data...
board2.pin_write([1], name = "x", type = "json")
board2.pin_versions("x")
```


However, you can opt out of this behaviour with `force_identical_write=True`:

```{python}
time.sleep(1.1) # try again...
board2.pin_write([1], name = "x", type = "json", force_identical_write=True)
board2.pin_versions("x")
```

By default, [](`~pins.boards.BaseBoard.pin_read`) will return the most recent version:

```{python}
Expand Down
47 changes: 45 additions & 2 deletions pins/boards.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ def _pin_store(
metadata: Mapping | None = None,
versioned: bool | None = None,
created: datetime | None = None,
*,
force_identical_write: bool = False,
) -> Meta:
if type == "feather":
warn_deprecated(
Expand All @@ -248,8 +250,16 @@ def _pin_store(

pin_name = self.path_to_pin(name)

# Pre-emptively fetch the most recent pin's meta if it exists - this is used
# for the force_identical_write check
abort_if_identical = not force_identical_write and self.pin_exists(name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this may make writing pins slightly slower by making a call to the pin in self.pin_exists(). I think it's the right move here (and I don't see a significant slowdown in tests at all), but just noting it in case other calls get added and things start to feel sluggish

if abort_if_identical:
last_meta = self.pin_meta(name)

with tempfile.TemporaryDirectory() as tmp_dir:
# create all pin data (e.g. data.txt, save object)
# create all pin data (e.g. data.txt, save object) to get the metadata.
# For unversioned boards, this also will delete the most recent pin version,
# ready for it to be replaced with a new one.
meta = self.prepare_pin_version(
tmp_dir,
x,
Expand All @@ -263,6 +273,18 @@ def _pin_store(
object_name=object_name,
)

# force_identical_write check
if abort_if_identical:
last_hash = last_meta.pin_hash

if last_hash == meta.pin_hash:
msg = (
f'The hash of pin "{name}" has not changed. Your pin will not '
f"be stored.",
)
inform(log=_log, msg=msg)
return last_meta

# move pin to destination ----
# create pin version folder
dst_pin_path = self.construct_path([pin_name])
Expand Down Expand Up @@ -310,6 +332,8 @@ def pin_write(
metadata: Mapping | None = None,
versioned: bool | None = None,
created: datetime | None = None,
*,
force_identical_write: bool = False,
) -> Meta:
"""Write a pin object to the board.

Expand All @@ -336,6 +360,17 @@ def pin_write(
created:
A date to store in the Meta.created field. This field may be used as
part of the pin version name.
force_identical_write:
Store the pin even if the pin contents are identical to the last version
(compared using the hash). Only the pin contents are compared, not the pin
metadata. Defaults to False.

Returns
-------
Meta:
Metadata about the stored pin. If `force_identical_write` is False and the
pin contents are identical to the last version, the last version's metadata
is returned.
"""

if type == "file":
Expand All @@ -345,7 +380,15 @@ def pin_write(
)

return self._pin_store(
x, name, type, title, description, metadata, versioned, created
x,
name,
type,
title,
description,
metadata,
versioned,
created,
force_identical_write=force_identical_write,
)

def pin_download(self, name, version=None, hash=None) -> Sequence[str]:
Expand Down
93 changes: 87 additions & 6 deletions pins/tests/test_boards.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,40 @@ def test_board_pin_write_file_raises_error(board, tmp_path):
board.pin_write(path, "cool_pin", type="file")


@pytest.mark.parametrize("force_identical_write", [True, False])
def test_board_pin_write_force_identical_write_pincount(board, force_identical_write):
    """Writing identical data twice adds a version only when the write is forced."""
    frame = pd.DataFrame({"x": [1, 2, 3]})

    # Backdate the first write by a minute so the two writes cannot collide
    # on the version name.
    earlier = datetime.now() - timedelta(minutes=1)
    board.pin_write(frame, "cool_pin", type="csv", created=earlier)
    board.pin_write(
        frame, "cool_pin", type="csv", force_identical_write=force_identical_write
    )

    # Forced: both writes are kept. Not forced: the duplicate is dropped.
    expected_count = 2 if force_identical_write else 1
    assert len(board.pin_versions("cool_pin")) == expected_count


def test_board_pin_write_force_identical_write_msg(
    board, capfd: pytest.CaptureFixture[str]
):
    """An unforced identical write reports on stderr that nothing was stored."""
    frame = pd.DataFrame({"x": [1, 2, 3]})

    # Backdate the first write by a minute so the two writes cannot collide
    # on the version name.
    earlier = datetime.now() - timedelta(minutes=1)
    board.pin_write(frame, "cool_pin", type="csv", created=earlier)
    board.pin_write(frame, "cool_pin", type="csv")
    stored_versions = board.pin_versions("cool_pin")

    captured = capfd.readouterr()
    expected = 'The hash of pin "cool_pin" has not changed. Your pin will not be stored.'
    assert expected in captured.err
    assert len(stored_versions) == 1


def test_board_pin_download(board_with_cache, tmp_path):
# create and save data
df = pd.DataFrame({"x": [1, 2, 3]})
Expand Down Expand Up @@ -309,6 +343,32 @@ def test_board_pin_read_insecure_succeed_board_flag(board):
# pin_write with unversioned boards ===========================================


@pytest.mark.parametrize("versioned", [None, False])
def test_board_unversioned_pin_write_unversioned_force_identical_write(
    versioned, board_unversioned
):
    """Forced writes to an unversioned board leave one version holding the latest data."""
    # Backdate the first write by a minute so the two writes cannot collide
    # on the version name.
    earlier = datetime.now() - timedelta(minutes=1)

    # Keyword arguments shared by both writes.
    shared = dict(type="json", versioned=versioned, force_identical_write=True)
    board_unversioned.pin_write({"a": 1}, "test_pin", created=earlier, **shared)
    board_unversioned.pin_write({"a": 2}, "test_pin", **shared)

    remaining = board_unversioned.pin_versions("test_pin")
    assert len(remaining) == 1
    assert board_unversioned.pin_read("test_pin") == {"a": 2}


@pytest.mark.parametrize("versioned", [None, False])
def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned):
board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=versioned)
Expand Down Expand Up @@ -346,9 +406,14 @@ def pin_name():

@pytest.fixture
def pin_del(board, df, pin_name):
meta_old = board.pin_write(df, pin_name, type="csv", title="some title")
sleep(1)
meta_new = board.pin_write(df, pin_name, type="csv", title="some title")
# 1min ago to avoid name collision
one_min_ago = datetime.now() - timedelta(minutes=1)
meta_old = board.pin_write(
df, pin_name, type="csv", title="some title", created=one_min_ago
)
meta_new = board.pin_write(
df, pin_name, type="csv", title="some title", force_identical_write=True
)

assert len(board.pin_versions(pin_name)) == 2
assert meta_old.version.version != meta_new.version.version
Expand All @@ -363,8 +428,22 @@ def pin_prune(board, df, pin_name):
two_days_ago = today - timedelta(days=2, minutes=1)

board.pin_write(df, pin_name, type="csv", title="some title", created=today)
board.pin_write(df, pin_name, type="csv", title="some title", created=day_ago)
board.pin_write(df, pin_name, type="csv", title="some title", created=two_days_ago)
board.pin_write(
df,
pin_name,
type="csv",
title="some title",
created=day_ago,
force_identical_write=True,
)
board.pin_write(
df,
pin_name,
type="csv",
title="some title",
created=two_days_ago,
force_identical_write=True,
)

versions = board.pin_versions(pin_name, as_df=False)
assert len(versions) == 3
Expand Down Expand Up @@ -546,7 +625,9 @@ def test_board_pin_search_admin_user(df, board_short, fs_admin): # noqa
@pytest.mark.fs_rsc
def test_board_rsc_pin_write_title_update(df, board_short):
board_short.pin_write(df, "susan/some_df", type="csv", title="title a")
board_short.pin_write(df, "susan/some_df", type="csv", title="title b")
board_short.pin_write(
df, "susan/some_df", type="csv", title="title b", force_identical_write=True
)

content = board_short.fs.info("susan/some_df")
assert content["title"] == "title b"
Expand Down