25  Create Tabular Data Package with R

Published

October 17, 2022

Warning

The code examples in this chapter must be interpreted by R instead of Bash.

You need to have the R frictionless package installed. To install the package, run

# One-time setup: install the frictionless package.
install.packages("frictionless")

After installing the package, load it together with the tidyverse package:

library(tidyverse)
library(frictionless)

Load our data:

# Read the iris measurements from the CSV file in the working directory.
df <- read_csv("iris.csv")
# Store Species as a factor instead of the character column read_csv() yields.
df[["Species"]] <- as.factor(df[["Species"]])

Create the Tabular Data Package in memory:

# Start from an empty package and attach the data frame as the "iris" resource.
my_data_package <- add_resource(
  create_package(),
  resource_name = "iris",
  data = df
)

Inspect the Tabular Data Package:

# Print the nested list structure of the package: its profile, its resources
# (data plus the generated schema) and its directory.
str(my_data_package)
List of 3
 $ profile  : chr "tabular-data-package"
 $ resources:List of 1
  ..$ :List of 4
  .. ..$ name   : chr "iris"
  .. ..$ data   : spec_tbl_df [150 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  .. .. ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  .. .. ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  .. .. ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  .. .. ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  .. .. ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  .. .. ..- attr(*, "spec")=
  .. .. .. .. cols(
  .. .. .. ..   Sepal.Length = col_double(),
  .. .. .. ..   Sepal.Width = col_double(),
  .. .. .. ..   Petal.Length = col_double(),
  .. .. .. ..   Petal.Width = col_double(),
  .. .. .. ..   Species = col_character()
  .. .. .. .. )
  .. .. ..- attr(*, "problems")=<externalptr> 
  .. ..$ profile: chr "tabular-data-resource"
  .. ..$ schema :List of 1
  .. .. ..$ fields:List of 5
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 3
  .. .. .. .. ..$ name       : chr "Species"
  .. .. .. .. ..$ type       : chr "string"
  .. .. .. .. ..$ constraints:List of 1
  .. .. .. .. .. ..$ enum: chr [1:3] "setosa" "versicolor" "virginica"
 $ directory: chr "."

Now it’s time to share our Tabular Data Package. Write it to disk and bundle the written files into a ZIP archive:

# Write the package (datapackage.json plus the resource CSV) to its directory.
write_package(
  my_data_package,
  directory = "tabular-data-package/iris-r"
)
# Bundle the two written files into a single ZIP archive for sharing.
# `--junk-paths` stores the files without their directory prefix.
# The `zip` argument is left at its default (the R_ZIPCMD environment
# variable, falling back to "zip"): gzip is not a ZIP archiver and does not
# understand `--junk-paths`, so it must not be passed as the zip command.
zip(
  zipfile = "tabular-data-package/iris-r.zip",
  files = c(
    "tabular-data-package/iris-r/datapackage.json",
    "tabular-data-package/iris-r/iris.csv"
  ),
  extras = "--junk-paths"
)

Last, deposit our Tabular Data Package on a research repository.

25.1 Improve Tabular Data Package

You can add more information to our Tabular Data Package:

# Package-level descriptive metadata.
my_data_package$title <- "Collection of Edgar Anderson's Iris Data"
my_data_package$description <- "Collection with the measurements in centimeters of the variables sepal length and width and petal length and width."
# The Data Package specification defines `contributors` and `licenses` as
# arrays of objects, so each entry is wrapped in an outer (unnamed) list.
my_data_package$contributors <- list(
  list(
    role = "author",
    title = "Edgar Anderson"
  )
)
# Assign with `$`: `my_data_package.licenses` would create a separate
# variable and leave the package without any license information.
my_data_package$licenses <- list(
  list(
    name = "CC-BY-4.0",
    path = "https://creativecommons.org/licenses/by/4.0/",
    title = "Creative Commons Attribution 4.0"
  )
)

Inspect the Tabular Data Package:

# Check that the package-level metadata assigned above is now part of the
# package list.
str(my_data_package)
List of 6
 $ profile     : chr "tabular-data-package"
 $ resources   :List of 1
  ..$ :List of 4
  .. ..$ name   : chr "iris"
  .. ..$ data   : spec_tbl_df [150 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  .. .. ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  .. .. ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  .. .. ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  .. .. ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  .. .. ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  .. .. ..- attr(*, "spec")=
  .. .. .. .. cols(
  .. .. .. ..   Sepal.Length = col_double(),
  .. .. .. ..   Sepal.Width = col_double(),
  .. .. .. ..   Petal.Length = col_double(),
  .. .. .. ..   Petal.Width = col_double(),
  .. .. .. ..   Species = col_character()
  .. .. .. .. )
  .. .. ..- attr(*, "problems")=<externalptr> 
  .. ..$ profile: chr "tabular-data-resource"
  .. ..$ schema :List of 1
  .. .. ..$ fields:List of 5
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 3
  .. .. .. .. ..$ name       : chr "Species"
  .. .. .. .. ..$ type       : chr "string"
  .. .. .. .. ..$ constraints:List of 1
  .. .. .. .. .. ..$ enum: chr [1:3] "setosa" "versicolor" "virginica"
 $ directory   : chr "."
 $ title       : chr "Collection of Edgar Anderson's Iris Data"
 $ description : chr "Collection with the measurements in centimeters of the variables sepal length and width and petal length and width."
 $ contributors:List of 2
  ..$ role : chr "author"
  ..$ title: chr "Edgar Anderson"

25.2 Improve Tabular Data Resource

You can add more information to resources in our Tabular Data Package:

# Append a human-readable title and description to the first (iris) resource.
my_data_package$resources[[1]] <- modifyList(
  my_data_package$resources[[1]],
  list(
    title = "Edgar Anderson's Iris Data",
    description = "The measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica."
  )
)

Inspect the Tabular Data Package:

# The resource entry should now carry `title` and `description` elements.
str(my_data_package)
List of 6
 $ profile     : chr "tabular-data-package"
 $ resources   :List of 1
  ..$ :List of 6
  .. ..$ name       : chr "iris"
  .. ..$ data       : spec_tbl_df [150 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  .. .. ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  .. .. ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  .. .. ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  .. .. ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  .. .. ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  .. .. ..- attr(*, "spec")=
  .. .. .. .. cols(
  .. .. .. ..   Sepal.Length = col_double(),
  .. .. .. ..   Sepal.Width = col_double(),
  .. .. .. ..   Petal.Length = col_double(),
  .. .. .. ..   Petal.Width = col_double(),
  .. .. .. ..   Species = col_character()
  .. .. .. .. )
  .. .. ..- attr(*, "problems")=<externalptr> 
  .. ..$ profile    : chr "tabular-data-resource"
  .. ..$ schema     :List of 1
  .. .. ..$ fields:List of 5
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Sepal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Length"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 2
  .. .. .. .. ..$ name: chr "Petal.Width"
  .. .. .. .. ..$ type: chr "number"
  .. .. .. ..$ :List of 3
  .. .. .. .. ..$ name       : chr "Species"
  .. .. .. .. ..$ type       : chr "string"
  .. .. .. .. ..$ constraints:List of 1
  .. .. .. .. .. ..$ enum: chr [1:3] "setosa" "versicolor" "virginica"
  .. ..$ title      : chr "Edgar Anderson's Iris Data"
  .. ..$ description: chr "The measurements in centimeters of the variables sepal length and width and petal length and width, respectivel"| __truncated__
 $ directory   : chr "."
 $ title       : chr "Collection of Edgar Anderson's Iris Data"
 $ description : chr "Collection with the measurements in centimeters of the variables sepal length and width and petal length and width."
 $ contributors:List of 2
  ..$ role : chr "author"
  ..$ title: chr "Edgar Anderson"

25.3 Improve Table Schema

The R frictionless package already added a Table Schema to the resources in our Tabular Data Package, including the constraints on the column Species. If needed, the Table Schema can be modified:

# Titles for the four measurement fields, in schema order.
measurement_titles <- c(
  "Sepal's Length",
  "Sepal's Width",
  "Petal's Length",
  "Petal's Width"
)

# All four measurement fields get a title and share the same description.
for (i in seq_along(measurement_titles)) {
  my_data_package$resources[[1]]$schema$fields[[i]]$title <-
    measurement_titles[[i]]
  my_data_package$resources[[1]]$schema$fields[[i]]$description <-
    "Measurements in centimeters"
}

# The fifth field (Species) only receives a title.
my_data_package$resources[[1]]$schema$fields[[5]]$title <- "Species"

Inspect the Tabular Data Package:

# Each schema field should now list its title (and, for the four
# measurements, its description).
str(my_data_package)
List of 6
 $ profile     : chr "tabular-data-package"
 $ resources   :List of 1
  ..$ :List of 6
  .. ..$ name       : chr "iris"
  .. ..$ data       : spec_tbl_df [150 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  .. .. ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  .. .. ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  .. .. ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  .. .. ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  .. .. ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
  .. .. ..- attr(*, "spec")=
  .. .. .. .. cols(
  .. .. .. ..   Sepal.Length = col_double(),
  .. .. .. ..   Sepal.Width = col_double(),
  .. .. .. ..   Petal.Length = col_double(),
  .. .. .. ..   Petal.Width = col_double(),
  .. .. .. ..   Species = col_character()
  .. .. .. .. )
  .. .. ..- attr(*, "problems")=<externalptr> 
  .. ..$ profile    : chr "tabular-data-resource"
  .. ..$ schema     :List of 1
  .. .. ..$ fields:List of 5
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Sepal.Length"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Sepal's Length"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Sepal.Width"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Sepal's Width"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Petal.Length"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Petal's Length"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Petal.Width"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Petal's Width"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Species"
  .. .. .. .. ..$ type       : chr "string"
  .. .. .. .. ..$ constraints:List of 1
  .. .. .. .. .. ..$ enum: chr [1:3] "setosa" "versicolor" "virginica"
  .. .. .. .. ..$ title      : chr "Species"
  .. ..$ title      : chr "Edgar Anderson's Iris Data"
  .. ..$ description: chr "The measurements in centimeters of the variables sepal length and width and petal length and width, respectivel"| __truncated__
 $ directory   : chr "."
 $ title       : chr "Collection of Edgar Anderson's Iris Data"
 $ description : chr "Collection with the measurements in centimeters of the variables sepal length and width and petal length and width."
 $ contributors:List of 2
  ..$ role : chr "author"
  ..$ title: chr "Edgar Anderson"

25.4 Replace Data Resource Location

You can replace a local resource with a copy hosted on the internet:

# Drop the in-memory data frame from the first resource ...
my_data_package$resources[[1]][["data"]] <- NULL
# ... and reference the published copy of the file by URL instead.
my_data_package$resources[[1]][["path"]] <-
  "https://zenodo.org/record/1319069/files/iris.csv?download=1"

Inspect the Tabular Data Package:

# The resource no longer has a `data` element; it has a `path` element holding
# the URL instead.
str(my_data_package)
List of 6
 $ profile     : chr "tabular-data-package"
 $ resources   :List of 1
  ..$ :List of 6
  .. ..$ name       : chr "iris"
  .. ..$ profile    : chr "tabular-data-resource"
  .. ..$ schema     :List of 1
  .. .. ..$ fields:List of 5
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Sepal.Length"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Sepal's Length"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Sepal.Width"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Sepal's Width"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Petal.Length"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Petal's Length"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Petal.Width"
  .. .. .. .. ..$ type       : chr "number"
  .. .. .. .. ..$ title      : chr "Petal's Width"
  .. .. .. .. ..$ description: chr "Measurements in centimeters"
  .. .. .. ..$ :List of 4
  .. .. .. .. ..$ name       : chr "Species"
  .. .. .. .. ..$ type       : chr "string"
  .. .. .. .. ..$ constraints:List of 1
  .. .. .. .. .. ..$ enum: chr [1:3] "setosa" "versicolor" "virginica"
  .. .. .. .. ..$ title      : chr "Species"
  .. ..$ title      : chr "Edgar Anderson's Iris Data"
  .. ..$ description: chr "The measurements in centimeters of the variables sepal length and width and petal length and width, respectivel"| __truncated__
  .. ..$ path       : chr "https://zenodo.org/record/1319069/files/iris.csv?download=1"
 $ directory   : chr "."
 $ title       : chr "Collection of Edgar Anderson's Iris Data"
 $ description : chr "Collection with the measurements in centimeters of the variables sepal length and width and petal length and width."
 $ contributors:List of 2
  ..$ role : chr "author"
  ..$ title: chr "Edgar Anderson"

Write the updated Tabular Data Package to disk:

# Write the updated package to the same directory as before.
package_dir <- "tabular-data-package/iris-r"
write_package(my_data_package, directory = package_dir)

And last, let’s deposit our Tabular Data Package on a research repository.