In-Class Exercise 05

Author

Lau Jia Yi

Published

May 17, 2025

Modified

May 17, 2025

pacman::p_load(tidyverse, jsonlite, SmartEDA, tidygraph, ggraph)

Load json file

Use fromJSON() of jsonlite to import the MC1_graph.json file into R and save the output object as kg.

# Parse the MC1 knowledge-graph JSON file into an R list.
kg <- fromJSON(txt = "MC1/data/MC1_graph.json")

Inspect structure

# Show the structure of the imported list, descending at most three levels.
str(object = kg, max.level = 3)
List of 5
 $ directed  : logi TRUE
 $ multigraph: logi TRUE
 $ graph     :List of 2
  ..$ node_default: Named list()
  ..$ edge_default: Named list()
 $ nodes     :'data.frame': 17412 obs. of  10 variables:
  ..$ Node Type     : chr [1:17412] "Song" "Person" "Person" "Person" ...
  ..$ name          : chr [1:17412] "Breaking These Chains" "Carlos Duffy" "Min Qin" "Xiuying Xie" ...
  ..$ single        : logi [1:17412] TRUE NA NA NA NA FALSE ...
  ..$ release_date  : chr [1:17412] "2017" NA NA NA ...
  ..$ genre         : chr [1:17412] "Oceanus Folk" NA NA NA ...
  ..$ notable       : logi [1:17412] TRUE NA NA NA NA TRUE ...
  ..$ id            : int [1:17412] 0 1 2 3 4 5 6 7 8 9 ...
  ..$ written_date  : chr [1:17412] NA NA NA NA ...
  ..$ stage_name    : chr [1:17412] NA NA NA NA ...
  ..$ notoriety_date: chr [1:17412] NA NA NA NA ...
 $ links     :'data.frame': 37857 obs. of  4 variables:
  ..$ Edge Type: chr [1:37857] "InterpolatesFrom" "RecordedBy" "PerformerOf" "ComposerOf" ...
  ..$ source   : int [1:37857] 0 0 1 1 2 2 3 5 5 5 ...
  ..$ target   : int [1:37857] 1841 4 0 16180 0 16180 0 5088 14332 11677 ...
  ..$ key      : int [1:37857] 0 0 0 0 0 0 0 0 0 0 ...

Extract and inspect

# Pull the node and link data frames out of the list and convert to tibbles.
nodes_tbl <- kg[["nodes"]] %>% as_tibble()
edges_tbl <- kg[["links"]] %>% as_tibble()

Initial EDA

# Horizontal bar chart: number of edges per edge type.
ggplot(edges_tbl, aes(y = `Edge Type`)) +
  geom_bar()

# Horizontal bar chart: number of nodes per node type.
ggplot(nodes_tbl, aes(y = `Node Type`)) +
  geom_bar()

Creating Knowledge Graph

Step 1: Mapping from node id to row index

# Lookup table pairing every node id with its row position in nodes_tbl.
id_map <- nodes_tbl %>%
  transmute(id, index = row_number())

This ensures each id from your node list is mapped to the correct row number.

Step 2: Map source and target IDs to row indices

# Translate each edge's `source` / `target` node id into the row index of the
# matching node in nodes_tbl, stored as `from` / `to`. Ids with no match in
# id_map yield NA.
# Any `from` / `to` columns left by an earlier run are dropped first, so the
# chunk can be re-run without rename() failing on a duplicated column name.
edges_tbl <- edges_tbl %>%
  select(-any_of(c("from", "to"))) %>%
  left_join(id_map, by = c("source" = "id")) %>%
  rename(from = index) %>%
  left_join(id_map, by = c("target" = "id")) %>%
  rename(to = index)

Step 3: Filter out any unmatched invalid edges

# Keep only edges whose two endpoints were both matched to a node row.
edges_tbl <- edges_tbl %>%
  filter(!is.na(from), !is.na(to))

Step 4: Creating the graph

Lastly, tbl_graph() is used to create tidygraph’s graph object by using the code chunk below.

# Assemble the tidygraph object; directedness is taken from the JSON metadata.
graph <- tbl_graph(
  nodes = nodes_tbl,
  edges = edges_tbl,
  directed = kg[["directed"]]
)

Visualising the knowledge graph

# Fix the random seed before the layouts below are computed.
set.seed(seed = 1234)

Visualising the whole graph

# Draw the full graph with the "fr" layout: grey edges, nodes coloured by
# node type, and repelled name labels.
ggraph(graph = graph, layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.3) +
  geom_node_point(aes(colour = `Node Type`), size = 4) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()

Visualising the sub-graph

Step 1: Filter edges to only MemberOf

# Activate the edge table and keep only the "MemberOf" edges.
graph_memberof <- graph %E>%
  filter(`Edge Type` == "MemberOf")

All 17,412 nodes still remain at this point: filtering the edges does not remove the nodes that have no MemberOf edge, so they are still present in graph_memberof.

Step 2: Extract only connected nodes (i.e., used in these edges)

# Collect the distinct node indices that appear as an endpoint (from or to)
# of any remaining edge.
used_node_indices <- graph_memberof %>%
  activate(edges) %>%
  as_tibble() %>%
  with(unique(c(from, to)))

The code above collects the indices of the nodes that appear in MemberOf edges; the nodes that are not used are removed in the next step.

Step 3: Keep only those nodes

# Keep only the nodes whose row position is among the indices used by the
# remaining edges.
graph_memberof <- graph_memberof %>%
  activate(nodes) %>%
  filter(row_number() %in% used_node_indices)
# Draw the MemberOf sub-graph with the "fr" layout: grey edges, small nodes
# coloured by node type, and repelled name labels.
ggraph(graph = graph_memberof, layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.5) +
  geom_node_point(aes(colour = `Node Type`), size = 1) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()