# Install (if missing) and attach every package used in this exercise.
pacman::p_load(tidyverse, jsonlite, SmartEDA, tidygraph, ggraph)
In-Class Exercise 05
Load json file
Use fromJSON() of jsonlite to import the MC1_graph.json file into R and save the output object as kg.
# Parse the knowledge-graph JSON file into an R list named `kg`.
kg <- fromJSON("MC1/data/MC1_graph.json")
Inspect structure
# Print a compact overview of `kg`, descending at most three levels deep.
kg %>% str(max.level = 3)
List of 5
$ directed : logi TRUE
$ multigraph: logi TRUE
$ graph :List of 2
..$ node_default: Named list()
..$ edge_default: Named list()
$ nodes :'data.frame': 17412 obs. of 10 variables:
..$ Node Type : chr [1:17412] "Song" "Person" "Person" "Person" ...
..$ name : chr [1:17412] "Breaking These Chains" "Carlos Duffy" "Min Qin" "Xiuying Xie" ...
..$ single : logi [1:17412] TRUE NA NA NA NA FALSE ...
..$ release_date : chr [1:17412] "2017" NA NA NA ...
..$ genre : chr [1:17412] "Oceanus Folk" NA NA NA ...
..$ notable : logi [1:17412] TRUE NA NA NA NA TRUE ...
..$ id : int [1:17412] 0 1 2 3 4 5 6 7 8 9 ...
..$ written_date : chr [1:17412] NA NA NA NA ...
..$ stage_name : chr [1:17412] NA NA NA NA ...
..$ notoriety_date: chr [1:17412] NA NA NA NA ...
$ links :'data.frame': 37857 obs. of 4 variables:
..$ Edge Type: chr [1:37857] "InterpolatesFrom" "RecordedBy" "PerformerOf" "ComposerOf" ...
..$ source : int [1:37857] 0 0 1 1 2 2 3 5 5 5 ...
..$ target : int [1:37857] 1841 4 0 16180 0 16180 0 5088 14332 11677 ...
..$ key : int [1:37857] 0 0 0 0 0 0 0 0 0 0 ...
Extract and inspect
# Pull the node and link data frames out of `kg` as tibbles.
nodes_tbl <- as_tibble(kg$nodes)
edges_tbl <- as_tibble(kg$links)
Initial EDA
# Count of edges per `Edge Type`, drawn as horizontal bars.
edges_tbl %>%
  ggplot(aes(y = `Edge Type`)) +
  geom_bar()

# Count of nodes per `Node Type`, drawn as horizontal bars.
nodes_tbl %>%
  ggplot(aes(y = `Node Type`)) +
  geom_bar()
Creating Knowledge Graph
Step 1 Mapping from node id to row index
# Lookup table pairing every node `id` with its 1-based row position in
# `nodes_tbl`; the edge table is later joined against it.
id_map <- tibble(id = nodes_tbl$id,
                 index = seq_len(
                   nrow(nodes_tbl)))
This ensures each id from your node list is mapped to the correct row number.
Step 2: Map source and target IDs to row indices
# Translate the `source` / `target` node ids of every edge into row indices
# of `nodes_tbl`, stored in the new columns `from` and `to`.
#
# The mapping is performed exactly once: running the same join/rename pair a
# second time fails because `rename(from = index)` would create a duplicate
# `from` column. Any `from` / `to` columns left over from an earlier run are
# dropped first, so this chunk can be re-executed safely.
edges_tbl <- edges_tbl %>%
  select(-any_of(c("from", "to"))) %>%
  left_join(id_map, by = c("source" = "id")) %>%
  rename(from = index) %>%
  left_join(id_map, by = c("target" = "id")) %>%
  rename(to = index)
Step 3: Filter out any unmatched invalid edges
# Keep only edges whose two endpoints were both matched to a node row.
edges_tbl <- edges_tbl %>%
  filter(!is.na(from) & !is.na(to))
Step 4: Creating the graph
Lastly, tbl_graph() is used to create tidygraph’s graph object by using the code chunk below.
# Assemble the tidygraph object from the node and edge tibbles; the
# directedness flag is taken from the imported JSON (`kg$directed`).
graph <- tbl_graph(nodes = nodes_tbl,
                   edges = edges_tbl,
                   directed = kg$directed)
Visualising the knowledge graph
# Fix the random-number seed so later randomised steps are reproducible.
set.seed(seed = 1234)
Visualising the whole graph
# Plot the full knowledge graph with the "fr" layout: faint gray edges,
# nodes coloured by `Node Type`, and repelled name labels.
graph %>%
  ggraph(layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.3) +
  geom_node_point(aes(color = `Node Type`), size = 4) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()
Visualising the sub-graph
Step 1: Filter edges to only MemberOf
# Restrict the graph's edges to those of type "MemberOf"; the node table is
# not touched by this step.
graph_memberof <- graph %>%
  activate(edges) %>%
  filter(`Edge Type` == "MemberOf")
All 17,412 nodes still remain at this point, because filtering the edges does not remove the nodes that have no MemberOf edge from graph_memberof.
Step 2: Extract only connected nodes (i.e., used in these edges)
# Collect the distinct node row indices that occur as either endpoint
# (`from` or `to`) of a remaining MemberOf edge.
used_node_indices <- graph_memberof %>%
  activate(edges) %>%
  as_tibble() %>%
  select(from, to) %>%
  unlist() %>%
  unique()
The code above collects the row indices of the nodes that appear in MemberOf edges; the nodes that are not in this list are removed in the next step.
Step 3: Keep only those nodes
# Drop every node that is not an endpoint of a MemberOf edge. A temporary
# `row_id` column exposes each node's row position so it can be matched
# against `used_node_indices`.
graph_memberof <- graph_memberof %>%
  activate(nodes) %>%
  mutate(row_id = row_number()) %>%
  filter(row_id %in% used_node_indices) %>%
  select(-row_id) # optional cleanup
# Plot the MemberOf sub-graph with the "fr" layout: gray edges, small nodes
# coloured by `Node Type`, and repelled name labels.
graph_memberof %>%
  ggraph(layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.5) +
  geom_node_point(aes(color = `Node Type`), size = 1) +
  geom_node_text(aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()