
How to analyze Set Data

Contents

  • Level of measurement
  • Datasets
    • dblp
    • MovieLens 20M
  • See also

Exploratory Data Analysis (EDA) is nice and easy if the entities you look at have one of the typical levels of measurement (see below). But if a feature is a set, it becomes harder. In this post, I want to show a couple of possibilities.

Level of measurement

EDA is straightforward as long as every feature belongs to one of the following levels of measurement:

Nominal (qualitative/categorical)
  • Empirical relations: equivalence
  • Allowed transformations: m' = f(m) with f bijective
  • Examples: telephone numbers, license plates, types, postal codes, gender
  • Values of m: numbers, names, symbols

Ordinal (qualitative/categorical)
  • Empirical relations: equivalence, order
  • Allowed transformations: m' = f(m) with f strictly monotonic
  • Examples: grades, degrees of hardness, wind force
  • Values of m: usually natural numbers

Interval (quantitative/metric)
  • Empirical relations: equivalence, order, empirical addition
  • Allowed transformations: m' = am + b with a > 0
  • Examples: temperature in °C or °F, calendar time, geographic height
  • Values of m: usually real numbers

Ratio (quantitative/metric)
  • Empirical relations: equivalence, order, empirical addition, empirical multiplication
  • Allowed transformations: m' = am with a > 0
  • Examples: mass, length, electric current
  • Values of m: usually real numbers > 0

Absolute (quantitative/metric)
  • Empirical relations: equivalence, order, empirical addition, empirical multiplication
  • Allowed transformations: m' = m
  • Examples: number of particles, number of errors
  • Values of m: usually natural numbers
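A set-valued feature such as "authors of a publication" fits none of these scales: there is no order between author sets and no meaningful arithmetic on them. One thing you can always do is derive scalar features from the set, which then do have a level of measurement. A minimal sketch with made-up data:

import pandas as pd

# Hypothetical set-valued feature
df = pd.DataFrame({"tags": [{"a", "b"}, {"b"}, {"a", "c", "d"}]})

# Derived scalar features:
df["n_tags"] = df["tags"].apply(len)                # set size: absolute scale
df["has_a"] = df["tags"].apply(lambda s: "a" in s)  # membership: nominal (binary)
print(df)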

Datasets

dblp

dblp is a computer science bibliography website which contains metadata for over 2 million publications.

Use DBLPParser to create a CSV file.
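The exact CSV layout depends on how you run the parser; the code below only assumes a column author in which the co-authors of a publication are joined by "::", roughly like this (made-up rows):

title,author
"Some Paper","Alice Example::Bob Example"
"Another Paper","Carol Example"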

from collections import Counter
import numpy as np
import pandas as pd
import progressbar
import networkx as nx
from itertools import combinations

import clana.io
import clana.visualize_cm

# Load the data
df = pd.read_csv("articles.csv")
df["author"] = df["author"].str.split("::")

# Analyze the data
df = df[~df["author"].isna()]
authors = [author for authorset in df["author"].tolist() for author in authorset]
author_count = Counter(authors)

print("* Publications: {}".format(len(df)))
print("* Unique elements: {}".format(len(author_count)))
print("* Most common:")
most_common = sorted(author_count.items(), key=lambda n: n[1], reverse=True)
for name, count in most_common[:10]:
    print("    {:>4}x {}".format(count, name))

unique_authors = sorted(author_count)


def get_biggest_clusters(edges, n=10):
    """Build a co-occurrence graph and return the n biggest connected components."""
    G = nx.Graph()
    for authorset in edges.tolist():
        for author in authorset:
            G.add_node(author)

    # Connect every pair of co-authors; limited to the first 10,000
    # publications to keep the runtime manageable.
    for authorset in progressbar.progressbar(edges.tolist()[:10_000]):
        for author1, author2 in combinations(authorset, 2):
            G.add_edge(author1, author2)

    print("Edges were added")

    components = sorted(nx.connected_components(G), key=len, reverse=True)
    return components[:n]


def create_matrix(nodes, edges):
    """Create a symmetric co-occurrence matrix over the given nodes."""
    n2i = {node: i for i, node in enumerate(sorted(nodes))}  # node to index
    mat = np.zeros((len(nodes), len(nodes)), dtype=np.int32)
    for edge in edges:
        for a, b in combinations(edge, 2):
            if a not in n2i or b not in n2i:
                continue
            mat[n2i[a], n2i[b]] += 1
            if a != b:
                mat[n2i[b], n2i[a]] += 1
    return mat, sorted(nodes)


components = get_biggest_clusters(df["author"])
print("* Biggest clusters: {}".format([len(el) for el in components]))

# Rank the authors of the biggest component by publication count
component_w_publications = [(author, author_count[author]) for author in components[0]]
component_w_publications = sorted(
    component_w_publications, key=lambda n: n[1], reverse=True
)
authors = [author for author, count in component_w_publications[:100]]
mat, labels = create_matrix(authors, df["author"].tolist())

clana.io.write_cm("coauthors.json", mat)
clana.io.write_labels("labels.json", labels)
clana.visualize_cm.main(
    "coauthors.json",
    perm_file="",
    steps=1_000_000,
    labels_file="labels.json",
    zero_diagonal=False,
    output="cm-ordered.pdf",
)

Results:

* Publications: 2,054,474
* Unique elements: 1,475,717
* Most common:
    1181x H. Vincent Poor
     789x Lajos Hanzo
     767x Witold Pedrycz
     747x Mohamed-Slim Alouini
     615x Chin-Chen Chang 0001
     607x Dacheng Tao
     591x Victor C. M. Leung
     570x Wei Zhang
     562x Wei Li
     554x Wei Wang
* Biggest clusters (under first 10^6 publications): [761987, 52, 45, 44, 32, 31, 29, 28, 28, 28]
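
Besides counting elements, the size of each set is itself an ordinary (absolute-scale) feature. Continuing with the df from above, a quick look at the distribution of authors per publication:

# Distribution of set sizes (authors per publication)
sizes = df["author"].apply(len)
print(sizes.describe())                            # mean, min, max, quartiles
print(sizes.value_counts().sort_index().head(10))  # papers with 1, 2, ... authors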

Then you can apply confusion matrix ordering (CMO) to find authors who often work together:

Figure: Similar authors found with CMO

The CMO technique is described in

Thoma, Martin. "Analysis and optimization of convolutional neural network architectures." arXiv preprint arXiv:1707.09725 (2017). Chapter 5.2.
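
If you only want the gist of CMO: it searches for a permutation of the rows and columns that moves large entries close to the diagonal. The following hill-climbing toy version illustrates the idea (clana's actual optimization is more elaborate; cmo_score and toy_cmo are made-up names):

import numpy as np


def cmo_score(mat):
    """Sum of entries weighted by their distance to the diagonal (lower is better)."""
    i, j = np.indices(mat.shape)
    return float((mat * np.abs(i - j)).sum())


def toy_cmo(mat, steps=10_000, seed=0):
    """Randomly swap two indices; keep the swap if the score does not get worse."""
    rng = np.random.default_rng(seed)
    perm = np.arange(len(mat))
    best = cmo_score(mat)
    for _ in range(steps):
        a, b = rng.integers(0, len(mat), size=2)
        perm[[a, b]] = perm[[b, a]]
        score = cmo_score(mat[np.ix_(perm, perm)])
        if score <= best:
            best = score
        else:
            perm[[a, b]] = perm[[b, a]]  # undo the swap
    return perm, best


# Usage with the co-occurrence matrix from above:
# perm, score = toy_cmo(mat)
# ordered = mat[np.ix_(perm, perm)]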

MovieLens 20M
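
MovieLens 20M contains 27,278 movies, and each movie carries a set of genres (pipe-separated in movies.csv), so the same approach as for dblp applies: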

from collections import Counter
import numpy as np
import pandas as pd
import progressbar
import networkx as nx
from itertools import combinations

import clana.io
import clana.visualize_cm

# Load the data
df = pd.read_csv("movies.csv")
df["genres"] = df["genres"].str.split("|")

# Analyze the data
list_values = [value for valueset in df["genres"].tolist() for value in valueset]
value_count = Counter(list_values)

print("* Movies: {}".format(len(df)))
print("* Unique genres: {}".format(len(value_count)))
print("* Most common:")
most_common = sorted(value_count.items(), key=lambda n: n[1], reverse=True)
for name, count in most_common[:10]:
    print("    {:>4}x {}".format(count, name))

unique_genres = sorted(value_count)


def get_biggest_clusters(edges, n=10):
    """Build a co-occurrence graph and return the n biggest connected components."""
    G = nx.Graph()
    for valueset in edges.tolist():
        for value in valueset:
            G.add_node(value)

    # Connect every pair of genres that occur together on a movie;
    # limited to the first 10,000 rows, as above.
    for valueset in progressbar.progressbar(edges.tolist()[:10_000]):
        for value1, value2 in combinations(valueset, 2):
            G.add_edge(value1, value2)

    print("Edges were added")

    components = sorted(nx.connected_components(G), key=len, reverse=True)
    return components[:n]


def create_matrix(nodes, edges):
    """Create a symmetric co-occurrence matrix over the given nodes."""
    n2i = {node: i for i, node in enumerate(sorted(nodes))}  # node to index
    mat = np.zeros((len(nodes), len(nodes)), dtype=np.int32)
    for edge in edges:
        for a, b in combinations(edge, 2):
            if a not in n2i or b not in n2i:
                continue
            mat[n2i[a], n2i[b]] += 1
            if a != b:
                mat[n2i[b], n2i[a]] += 1
    return mat, sorted(nodes)


components = get_biggest_clusters(df["genres"])
print("* Biggest clusters: {}".format([len(el) for el in components]))

# Rank the genres of the biggest component by how often they occur
component_w_counts = [(value, value_count[value]) for value in components[0]]
component_w_counts = sorted(component_w_counts, key=lambda n: n[1], reverse=True)
values = [value for value, count in component_w_counts[:100]]
mat, labels = create_matrix(values, df["genres"].tolist())

clana.io.write_cm("genre-combinations.json", mat)
clana.io.write_labels("labels.json", labels)
clana.visualize_cm.main(
    "genre-combinations.json",
    perm_file="",
    steps=1_000_000,
    labels_file="labels.json",
    zero_diagonal=False,
    output="cm-genre-combinations.pdf",
)

Results:

* Movies: 27278
* Unique genres: 20
* Most common:
    13344x Drama
     8374x Comedy
     4178x Thriller
     4127x Romance
     3520x Action
     2939x Crime
     2611x Horror
     2471x Documentary
     2329x Adventure
     1743x Sci-Fi

CMO:

Figure: Genre combinations found with CMO
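
With only 20 unique genres you do not strictly need the graph detour: the co-occurrence matrix can also be built directly, e.g. with scikit-learn's MultiLabelBinarizer. A sketch, assuming df is loaded as above:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df["genres"])  # binary movie x genre indicator matrix
co = X.T @ X                         # co[i, j] = movies tagged with both genres
print(mlb.classes_)                  # genre labels in matrix order
print(co)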

See also

  • What is the level of measurement / name of the scale of list-features?

