A git2rdata object consists of two files.
The ".tsv"
file contains the raw data as a plain text tab separated file.
The ".yml"
contains the metadata on the columns in plain text YAML format.
See vignette("plain_text", package = "git2rdata")
for more details on the
implementation.
Usage
write_vc(
x,
file,
root = ".",
sorting,
strict = TRUE,
optimize = TRUE,
na = "NA",
...,
split_by
)
# S3 method for class 'character'
write_vc(
x,
file,
root = ".",
sorting,
strict = TRUE,
optimize = TRUE,
na = "NA",
...,
split_by = character(0)
)
# S3 method for class 'git_repository'
write_vc(
x,
file,
root,
sorting,
strict = TRUE,
optimize = TRUE,
na = "NA",
...,
stage = FALSE,
force = FALSE
)
Arguments
- x
the data.frame.
- file
the name of the git2rdata object. Git2rdata objects cannot have dots in their name. The name may include a relative path. file is a path relative to the root. Note that file must point to a location within root.
- root
The root of a project. Can be a file path or a git-repository. Defaults to the current working directory (".").
- sorting
an optional vector of column names defining which columns to use for sorting x and in what order to use them. The default empty sorting yields a warning. Add sorting to avoid this warning. Strongly recommended in combination with version control. See vignette("efficiency", package = "git2rdata") for an illustration of the importance of sorting.
- strict
What to do when the metadata changes. strict = FALSE overwrites the data and the metadata with a warning listing the changes, strict = TRUE returns an error and leaves the data and metadata as is. Defaults to TRUE.
- optimize
If TRUE, recode the data to get smaller text files. If FALSE, meta() converts the data to character. Defaults to TRUE.
- na
the string to use for missing values in the data.
- ...
parameters used in some methods
- split_by
An optional vector of variable names to split the text files. This creates a separate file for every combination. We prepend these variables to the vector of sorting variables.
- stage
Logical value indicating whether to stage the changes after writing the data. Defaults to FALSE.
- force
Add ignored files. Default is FALSE.
Value
a named vector with the file paths relative to root. The names contain the hashes of the files.
See also
Other storage:
display_metadata()
,
list_data()
,
prune_meta()
,
read_vc()
,
relabel()
,
rename_variable()
,
rm_data()
,
update_metadata()
,
verify_vc()
Examples
## on file system
# create a directory
root <- tempfile("git2rdata-")
dir.create(root)
# write a dataframe to the directory
write_vc(iris[1:6, ], file = "iris", root = root, sorting = "Sepal.Length")
#> 09d5bfd6a65e682a4ca030c766348180861568c8
#> "iris.tsv"
#> 0d434e56d22a710c99c5b912e8624d52abd41aaf
#> "iris.yml"
# check that a data file (.tsv) and a metadata file (.yml) exist.
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# read the git2rdata object from the directory
read_vc("iris", root)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 4.6 3.1 1.5 0.2 setosa
#> 2 4.7 3.2 1.3 0.2 setosa
#> 3 4.9 3.0 1.4 0.2 setosa
#> 4 5.0 3.6 1.4 0.2 setosa
#> 5 5.1 3.5 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#>
#> Use `display_metadata()` to view the metadata.
# store a new version with different observations but the same metadata
write_vc(iris[1:5, ], "iris", root)
#> 31ff841b58e569e8a4a4ac2f02152295c19f94db
#> "iris.tsv"
#> 0d434e56d22a710c99c5b912e8624d52abd41aaf
#> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# Removing a column requires new metadata.
# Add strict = FALSE to override the existing metadata.
write_vc(
iris[1:6, -2], "iris", root, sorting = "Sepal.Length", strict = FALSE
)
#> Warning: Changes in the metadata may lead to unnecessarily large diffs.
#> See vignette('version_control', package = 'git2rdata') for more information.
#>
#> - New data has a different number of variables.
#> - Deleted variables: Sepal.Width.
#> b2098d507b0d749a86bb61a185ab2d31f7622418
#> "iris.tsv"
#> 274646434951b078260e194a51f349a30777ebf2
#> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# storing the original version again requires another update of the metadata
write_vc(iris[1:6, ], "iris", root, sorting = "Sepal.Width", strict = FALSE)
#> Warning: Changes in the metadata may lead to unnecessarily large diffs.
#> See vignette('version_control', package = 'git2rdata') for more information.
#>
#> - The sorting variables changed.
#> - Sorting for the new data: 'Sepal.Width'.
#> - Sorting for the old data: 'Sepal.Length'.
#> - New data has a different number of variables.
#> - New variables: Sepal.Width.
#> 4045436d3a61801f4eaad5769e32726838deecbc
#> "iris.tsv"
#> 928750d3071a23b52b05b88f0c2cb6f10b09789d
#> "iris.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml"
# optimize = FALSE stores the data more verbosely. This requires larger files.
write_vc(
iris[1:6, ], "iris2", root, sorting = "Sepal.Width", optimize = FALSE
)
#> 79547bc5fecc2c82bd01988d1591130e578fdcf9
#> "iris2.csv"
#> 65cc08c8736657fd3e523180b46353de368b22d1
#> "iris2.yml"
list.files(root, recursive = TRUE)
#> [1] "iris.tsv" "iris.yml" "iris2.csv" "iris2.yml"
## on git repo using a git2r::git-repository
# initialise a git repo using the git2r package
repo_path <- tempfile("git2rdata-repo-")
dir.create(repo_path)
repo <- git2r::init(repo_path)
git2r::config(repo, user.name = "Alice", user.email = "alice@example.org")
# store a dataframe in git repo.
write_vc(iris[1:6, ], file = "iris", root = repo, sorting = "Sepal.Length")
#> 09d5bfd6a65e682a4ca030c766348180861568c8
#> "iris.tsv"
#> 0d434e56d22a710c99c5b912e8624d52abd41aaf
#> "iris.yml"
# This git2rdata object is not staged by default.
status(repo)
#> Untracked files:
#> Untracked: iris.tsv
#> Untracked: iris.yml
#>
# read a dataframe from a git repo
read_vc("iris", repo)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 4.6 3.1 1.5 0.2 setosa
#> 2 4.7 3.2 1.3 0.2 setosa
#> 3 4.9 3.0 1.4 0.2 setosa
#> 4 5.0 3.6 1.4 0.2 setosa
#> 5 5.1 3.5 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#>
#> Use `display_metadata()` to view the metadata.
# store a new version in the git repo and stage it in one go
write_vc(iris[1:5, ], "iris", repo, stage = TRUE)
#> 31ff841b58e569e8a4a4ac2f02152295c19f94db
#> "iris.tsv"
#> 0d434e56d22a710c99c5b912e8624d52abd41aaf
#> "iris.yml"
status(repo)
#> Staged changes:
#> New: iris.tsv
#> New: iris.yml
#>
# store a verbose version in a different git2rdata object
write_vc(
iris[1:6, ], "iris2", repo, sorting = "Sepal.Width", optimize = FALSE
)
#> 79547bc5fecc2c82bd01988d1591130e578fdcf9
#> "iris2.csv"
#> 65cc08c8736657fd3e523180b46353de368b22d1
#> "iris2.yml"
status(repo)
#> Untracked files:
#> Untracked: iris2.csv
#> Untracked: iris2.yml
#>
#> Staged changes:
#> New: iris.tsv
#> New: iris.yml
#>