Skip to contents

Let us initialize a dataset from Wikidata.

# Attach the packages used in this walkthrough.
library(dataset)
library(wbdataset)

# Load the `wikidata_countries_df` example data; the object is reassigned
# below with the result of get_item().
data("wikidata_countries_df")

# Entity URIs of the four Wikidata items to retrieve.
wikidata_countries <- paste0(
  "http://www.wikidata.org/entity/",
  c("Q756617", "Q347", "Q3908", "Q1246")
)

# Request the items in English; the title and the creator given here show up
# in the citation line printed with the dataset.
wikidata_countries_df <- get_item(
  qid = wikidata_countries,
  language = "en",
  title = "European countries",
  creator = person("Daniel", "Antal")
)
print(wikidata_countries_df)
#> Antal D (2024). "European countries."
#>   qid        label              description                             language
#>   <hvn_lbl_> <hvn_lbl_>         <hvn_lbl_>                              <hvn_lb>
#> 1 Q756617    Kingdom of Denmark Kingdom of Denmark and its autonomous … en      
#> 2 Q347       Liechtenstein      country in Central Europe               en      
#> 3 Q3908      Galicia            autonomous community of Spain           en      
#> 4 Q1246      Kosovo             country in southeastern Europe          en

The provenance and the definition of the key qid column are well described in the attributes. Now let us add further columns, making sure that we include the precise definition of each variable.

# Join three Wikidata properties to the dataset as new columns, passing a
# human-readable label for each one (and a namespace for the Geonames ID).
ds <- wikidata_countries_df %>%
  left_join_column(
    property = "P297",
    label = "ISO 3166-1 alpha-2 code"
  ) %>%
  left_join_column(
    property = "P1566",
    label = "Geonames ID",
    namespace = "https://www.geonames.org/"
  ) %>%
  left_join_column(
    property = "P1889",
    label = "different from Wikipedia item"
  )
#> Left join claims: 1/4: Q756617 P297
#> Left join claims: 2/4: Q347 P297
#> Left join claims: 3/4: Q3908 P297
#> Left join claims: 4/4: Q1246 P297
#> Left join claims: 1/4: Q756617 P1566
#> Left join claims: 2/4: Q347 P1566
#> Left join claims: 3/4: Q3908 P1566
#> Left join claims: 4/4: Q1246 P1566
#> Left join claims: 1/4: Q756617 P1889
#> Left join claims: 2/4: Q347 P1889
#> Left join claims: 3/4: Q3908 P1889
#> Left join claims: 4/4: Q1246 P1889
# Print the dataset, which now includes the P297, P1566 and P1889 columns.
print(ds)
#> Antal D (2024). "European countries."
#>   qid        label              description           language P297  P1566 P1889
#>   <hvn_lbl_> <hvn_lbl_>         <hvn_lbl_>            <hvn_lb> <hvn> <hvn> <hvn>
#> 1 Q756617    Kingdom of Denmark Kingdom of Denmark a… en       DK    NA    Q35  
#> 2 Q347       Liechtenstein      country in Central E… en       LI    3042… NA   
#> 3 Q3908      Galicia            autonomous community… en       NA    3336… Q180…
#> 4 Q1246      Kosovo             country in southeast… en       XK    8310… Q1231
# Inspect the metadata attributes stored on the P1566 (Geonames ID) column.
attributes(ds$P1566)
#> $label
#> [1] "Geonames ID"
#> 
#> $namespace
#> [1] "https://www.geonames.org/"
#> 
#> $class
#> [1] "haven_labelled_defined" "haven_labelled"         "vctrs_vctr"            
#> [4] "character"

With this namespace, the third cell in the P1566 column (the Geonames ID of Galicia) resolves to https://www.geonames.org/3336902. Galicia is not a sovereign state; therefore, it has no P297 value, i.e., it has no ISO country code.

# Print the bibliographic entry of the dataset in BibTeX format.
print(dataset::get_bibentry(ds), style = "bibtex")
#> @Misc{,
#>   title = {European countries},
#>   author = {Daniel Antal},
#>   publisher = {:unas},
#>   year = {2024},
#>   resourcetype = {Dataset},
#>   version = {0.1.0},
#>   description = {:unas},
#>   language = {:unas},
#>   format = {application/r-rds},
#>   rights = {:unas},
#> }

Some provenance is recorded:

# Query the provenance metadata of the dataset.
# NOTE(review): the rendered output below is NULL; confirm whether
# provenance is expected to be recorded for `ds` at this point.
dataset::provenance(ds)
#> NULL