Skip to contents

Hellinger distances, either pairwise within a single tidied topic model dataframe or between two tidied topic model dataframes

Usage

# S3 method for data.frame
hellinger(
  topicsdf1,
  id1 = "document",
  cat1 = "topic",
  prob1 = "prob",
  topicsdf2 = NULL,
  id2 = "document",
  cat2 = "topic",
  prob2 = "prob",
  df = FALSE
)

Arguments

df

If FALSE (default), return the matrix of Hellinger distances; if TRUE, return a tidy dataframe.

topicsdf1, topicsdf2

Tidied topic model dataframes

id1, id2

Columns containing the unit identifiers (DOIs, auids, ORU names, etc.)

cat1, cat2

Columns containing the category identifiers (topics)

prob1, prob2

Columns containing the probability values (gamma)

Value

Matrix (default) or tidy dataframe of Hellinger distances

Examples

# Fix the RNG seed so the simulated draws are reproducible.
# 2007 is the value of the arithmetic expression 2022 - 06 - 09.
set.seed(2007)

# First tidied topic model dataframe: one row per document-topic pair, with
# the probability stored in `gamma`. Documents are labelled doc_1 to doc_3.
draws1 <- rdirichlet(3, rep(5, 5))
wide1 <- tibble::as_tibble(
  draws1,
  rownames = 'doc_id',
  .name_repair = tmfast:::make_colnames
)
wide1 <- dplyr::mutate(wide1, doc_id = stringr::str_c('doc_', doc_id))
topics1 <- tidyr::pivot_longer(
  wide1,
  tidyselect::starts_with('V'),
  names_to = 'topic',
  values_to = 'gamma'
)

# Second tidied topic model dataframe, built the same way from a fresh set of
# draws; the row index is shifted by 5 so documents are labelled doc_6 to
# doc_8.
draws2 <- rdirichlet(3, rep(5, 5))
wide2 <- tibble::as_tibble(
  draws2,
  rownames = 'doc_id',
  .name_repair = tmfast:::make_colnames
)
wide2 <- dplyr::mutate(
  wide2,
  doc_id = stringr::str_c('doc_', as.integer(doc_id) + 5)
)
topics2 <- tidyr::pivot_longer(
  wide2,
  tidyselect::starts_with('V'),
  names_to = 'topic',
  values_to = 'gamma'
)

# Pairwise distances among the documents of `topics1`, returned as a tidy
# dataframe because `df = TRUE`.
hellinger(
  topics1,
  doc_id,
  prob1 = 'gamma',
  df = TRUE
)
#> # A tibble: 9 × 3
#>   doc_id document         dist
#>   <chr>  <chr>           <dbl>
#> 1 doc_1  doc_1    0.0000000105
#> 2 doc_1  doc_2    0.307       
#> 3 doc_1  doc_3    0.267       
#> 4 doc_2  doc_1    0.307       
#> 5 doc_2  doc_2    0           
#> 6 doc_2  doc_3    0.123       
#> 7 doc_3  doc_1    0.267       
#> 8 doc_3  doc_2    0.123       
#> 9 doc_3  doc_3    0           

# Distances between the documents of `topics1` (rows) and those of `topics2`
# (columns), returned as a matrix because `df` is left at its default.
hellinger(
  topics1,
  doc_id,
  prob1 = 'gamma',
  topicsdf2 = topics2,
  id2 = doc_id,
  prob2 = 'gamma'
)
#>           doc_6     doc_7      doc_8
#> doc_1 0.2361547 0.2632094 0.33018266
#> doc_2 0.1777705 0.1060308 0.12270871
#> doc_3 0.1296687 0.1732766 0.08788004