Exploratory Data Analysis (EDA) is straightforward as long as every feature has one of the typical levels of measurement (see below). As soon as one feature is a set, it gets harder. In this post, I want to show a couple of ways to deal with set-valued features.
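To make that concrete, here is a tiny made-up example of a set-valued feature. None of the usual summary statistics apply to it directly:

```python
import pandas as pd

# Made-up example: the "authors" feature of each row is a *set* of values
df = pd.DataFrame(
    {
        "title": ["Paper A", "Paper B"],
        "authors": [["Alice", "Bob"], ["Bob", "Carol"]],
    }
)
# There is no mean, median, or histogram for a column of sets,
# so the usual EDA toolbox does not apply directly.
print(df)
```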
## Level of measurement
Features commonly have one of the following levels of measurement:
| Scale | Nominal | Ordinal | Interval | Ratio | Absolute |
|---|---|---|---|---|---|
| Type | qualitative (categorical) | qualitative (categorical) | quantitative (metric) | quantitative (metric) | quantitative (metric) |
| Empirical relations | equivalence | equivalence, order | equivalence, order, emp. addition | equivalence, order, emp. addition, emp. multiplication | equivalence, order, emp. addition, emp. multiplication |
| Allowed transformations | m' = f(m) with f bijective | m' = f(m) with f strictly monotonic | m' = am + b with a > 0 | m' = am with a > 0 | m' = m |
| Examples | telephone numbers, license plates, types, postal codes, gender | grades, degrees of hardness, wind force | temperature in °C or °F, calendar time, altitude | mass, length, electric current | number of particles, number of errors |
| Values of m | numbers, names, symbols | usually natural numbers | usually real numbers | usually positive real numbers | usually natural numbers |
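As a quick illustration of the "allowed transformations" row (my own example, not part of the original table): an interval-scale transformation m' = am + b preserves ratios of differences, but not ratios of the values themselves:

```python
celsius = [0.0, 10.0, 30.0]
fahrenheit = [c * 9 / 5 + 32 for c in celsius]  # m' = am + b with a = 9/5, b = 32

# Ratios of differences survive the transformation ...
print((celsius[2] - celsius[0]) / (celsius[1] - celsius[0]))              # 3.0
print((fahrenheit[2] - fahrenheit[0]) / (fahrenheit[1] - fahrenheit[0]))  # 3.0

# ... but ratios of the values do not: 30 °C is not "3x as warm" in °F
print(celsius[2] / celsius[1])        # 3.0
print(fahrenheit[2] / fahrenheit[1])  # 1.72
```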
## Datasets
### dblp
dblp is a computer science bibliography website. The dataset used here contains just over 2 million publications.
Use DBLPParser to create a CSV file.
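The code below assumes the resulting CSV has an `author` column in which co-authors are joined by `::`; the exact columns depend on how you run DBLPParser. Made-up example rows:

```csv
title,author
"An Example Paper","Alice Smith::Bob Jones"
"Another Paper","Carol White"
```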
```python
from collections import Counter
from itertools import combinations

import clana.io
import clana.visualize_cm
import networkx as nx
import numpy as np
import pandas as pd
import progressbar

# Load the data
df = pd.read_csv("articles.csv")
df["author"] = df["author"].str.split("::")

# Analyze the data
df = df[~df["author"].isna()]
authors = [author for authorset in df["author"].tolist() for author in authorset]
author_count = Counter(authors)
print("* Publications: {}".format(len(df)))
print("* Unique elements: {}".format(len(author_count)))
print("* Most common:")
most_common = sorted(author_count.items(), key=lambda n: n[1], reverse=True)
for name, count in most_common[:10]:
    print("    {:>4}x {}".format(count, name))
unique_authors = sorted(author_count.keys())


def get_biggest_clusters(edges, n=10):
    G = nx.Graph()
    for authorset in edges.tolist():
        for author in authorset:
            G.add_node(author)
    # Connect all co-authors of the first 10^6 publications
    for authorset in progressbar.progressbar(edges.tolist()[:1_000_000]):
        for author1, author2 in combinations(authorset, 2):
            G.add_edge(author1, author2)
    print("Edges were added")
    components = sorted(nx.connected_components(G), key=len, reverse=True)
    return components[:n]


def create_matrix(nodes, edges):
    # Map each node to its index in the sorted node list
    n2i = {node: i for i, node in enumerate(sorted(nodes))}
    mat = np.zeros((len(nodes), len(nodes)), dtype=np.int32)
    for edge in edges:
        for a, b in combinations(edge, 2):
            if a not in n2i or b not in n2i:
                continue
            # Count each co-occurrence symmetrically
            mat[n2i[a]][n2i[b]] += 1
            if a != b:
                mat[n2i[b]][n2i[a]] += 1
    return mat, sorted(nodes)


components = get_biggest_clusters(df["author"])
print("* Biggest clusters: {}".format([len(el) for el in components]))

# Pair each author of the biggest cluster with their publication count
component_w_publications = [(author, author_count[author]) for author in components[0]]
component_w_publications = sorted(
    component_w_publications, key=lambda n: n[1], reverse=True
)

# Keep only the 100 most prolific authors of the biggest cluster
authors = [author for author, count in component_w_publications[:100]]
mat, labels = create_matrix(authors, df["author"].tolist())

# Write the matrix and the labels first, then visualize them
clana.io.write_cm("coauthors.json", mat)
clana.io.write_labels("labels.json", labels)
clana.visualize_cm.main(
    "coauthors.json",
    perm_file="",
    steps=1_000_000,
    labels_file="labels.json",
    zero_diagonal=False,
    output="cm-ordered.pdf",
)
```
Results:
```
* Publications: 2,054,474
* Unique elements: 1,475,717
* Most common:
    1181x H. Vincent Poor
     789x Lajos Hanzo
     767x Witold Pedrycz
     747x Mohamed-Slim Alouini
     615x Chin-Chen Chang 0001
     607x Dacheng Tao
     591x Victor C. M. Leung
     570x Wei Zhang
     562x Wei Li
     554x Wei Wang
* Biggest clusters (under first 10^6 publications): [761987, 52, 45, 44, 32, 31, 29, 28, 28, 28]
```
Then you can apply confusion matrix ordering (CMO) to find authors who often work together (click the image for a larger version):

The CMO technique is described in chapter 5.2 of: Thoma, Martin. "Analysis and optimization of convolutional neural network architectures." arXiv preprint arXiv:1707.09725 (2017).
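The rough idea: treat the co-occurrence matrix like a confusion matrix and search for a row/column permutation that moves large entries towards the diagonal. Below is a minimal greedy sketch of that objective; it is not clana's actual implementation, only an illustration of what is being optimized:

```python
import numpy as np


def cmo_score(mat):
    """Sum of matrix entries, weighted by their distance to the diagonal."""
    idx = np.arange(len(mat))
    distance = np.abs(idx[:, None] - idx[None, :])
    return float((mat * distance).sum())


def greedy_cmo(mat, iterations=10_000, seed=0):
    """Swap random row/column pairs; keep swaps that lower the score."""
    rng = np.random.default_rng(seed)
    perm = np.arange(len(mat))
    best = cmo_score(mat)
    for _ in range(iterations):
        i, j = rng.integers(0, len(mat), size=2)
        perm[[i, j]] = perm[[j, i]]
        score = cmo_score(mat[np.ix_(perm, perm)])
        if score < best:
            best = score
        else:
            perm[[i, j]] = perm[[j, i]]  # undo the swap
    return perm
```

The resulting permutation can then be applied as `mat[np.ix_(perm, perm)]` before plotting.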
### MovieLens 20M

MovieLens 20M is a dataset of 20 million movie ratings; each of its 27,278 movies is tagged with a set of genres, so the same approach as above applies.
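The `movies.csv` from the MovieLens 20M download looks roughly like this (genres are joined by `|`):

```csv
movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
```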
```python
from collections import Counter
from itertools import combinations

import clana.io
import clana.visualize_cm
import networkx as nx
import numpy as np
import pandas as pd
import progressbar

# Load the data
df = pd.read_csv("movies.csv")
df["genres"] = df["genres"].str.split("|")

# Analyze the data
list_values = [value for valueset in df["genres"].tolist() for value in valueset]
value_count = Counter(list_values)
print("* Movies: {}".format(len(df)))
print("* Unique genres: {}".format(len(value_count)))
print("* Most common:")
most_common = sorted(value_count.items(), key=lambda n: n[1], reverse=True)
for name, count in most_common[:10]:
    print("    {:>4}x {}".format(count, name))
unique_genres = sorted(value_count.keys())


def get_biggest_clusters(edges, n=10):
    G = nx.Graph()
    for valueset in edges.tolist():
        for value in valueset:
            G.add_node(value)
    # Connect all genres which appear together in at least one movie
    for valueset in progressbar.progressbar(edges.tolist()):
        for value1, value2 in combinations(valueset, 2):
            G.add_edge(value1, value2)
    print("Edges were added")
    components = sorted(nx.connected_components(G), key=len, reverse=True)
    return components[:n]


def create_matrix(nodes, edges):
    # Map each node to its index in the sorted node list
    n2i = {node: i for i, node in enumerate(sorted(nodes))}
    mat = np.zeros((len(nodes), len(nodes)), dtype=np.int32)
    for edge in edges:
        for a, b in combinations(edge, 2):
            if a not in n2i or b not in n2i:
                continue
            # Count each co-occurrence symmetrically
            mat[n2i[a]][n2i[b]] += 1
            if a != b:
                mat[n2i[b]][n2i[a]] += 1
    return mat, sorted(nodes)


components = get_biggest_clusters(df["genres"])
print("* Biggest clusters: {}".format([len(el) for el in components]))

# Pair each genre of the biggest cluster with its number of movies
genre_w_count = [(genre, value_count[genre]) for genre in components[0]]
genre_w_count = sorted(genre_w_count, key=lambda n: n[1], reverse=True)

# Keep the 100 most common genres (there are only 20 in total anyway)
genres = [genre for genre, count in genre_w_count[:100]]
mat, labels = create_matrix(genres, df["genres"].tolist())

clana.io.write_cm("genre-combinations.json", mat)
clana.io.write_labels("labels.json", labels)
clana.visualize_cm.main(
    "genre-combinations.json",
    perm_file="",
    steps=1_000_000,
    labels_file="labels.json",
    zero_diagonal=False,
    output="cm-genre-combinations.pdf",
)
```
Results:
```
* Movies: 27278
* Unique genres: 20
* Most common:
    13344x Drama
     8374x Comedy
     4178x Thriller
     4127x Romance
     3520x Action
     2939x Crime
     2611x Horror
     2471x Documentary
     2329x Adventure
     1743x Sci-Fi
```
The result after applying CMO:
