## Use the K-medoids algorithm to find suitable human model representatives

### Imports

In [1]:
import os, sys
lib_path = os.path.abspath(os.path.join('..', '..', 'utils'))
sys.path.append(lib_path)
from GraphType import GraphStat
import readCSV as reader
from scipy import stats
from ipywidgets import interact, fixed, interactive
import ipywidgets as widgets
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import distance_metric, type_metric
import random

### Define a new distance metric

In [2]:
def ks_value(dest1, dest2):
    """Return the two-sample Kolmogorov-Smirnov statistic of two samples.

    Only the statistic (the first element of the ``stats.ks_2samp`` result)
    is returned; the accompanying p-value is discarded.
    """
    return stats.ks_2samp(dest1, dest2)[0]


# Wrap ks_value as a user-defined pyclustering distance metric; it is passed
# to kmedoids as the `metric` argument when searching for a representative.
ks_metric = distance_metric(type_metric.USER_DEFINED, func=ks_value)

### Read Human Models

In [3]:
# Progress widget (only constructed here; this cell neither displays nor updates it)
w = widgets.FloatProgress(
    value=0, min=0, max=1.0, step=0.1,
    description='Loading Files...:',
    bar_style='info',
    orientation='horizontal',
)

humanFiles = reader.readmultiplefiles('../input/human_output_100/', 1300, False)

# Build one GraphStat per entry of humanFiles and remember which entry it came from.
modelToFileName = {GraphStat(path): path for path in humanFiles}

# Models in the dict's insertion order, so positions line up with modelToFileName.values().
models = list(modelToFileName)

len(humanFiles)

304

### Find the representative by K-medoids for different distributions on GraphStat

* Returns the index of the representative

In [4]:
def findRep(graphStats, func):
    """Find the K-medoids representative (a single medoid) of a set of models.

    Args:
        graphStats: iterable of model objects to cluster.
        func: callable mapping one model to the sample that is compared
            with the KS metric (e.g. ``lambda m: m.out_d``).

    Returns:
        The index (position within ``graphStats``) of the medoid reported
        by kmedoids.

    Raises:
        ValueError: if ``graphStats`` is empty.
    """
    # Use the argument that was passed in, not the notebook-level `models` global.
    samples = [func(stat) for stat in graphStats]
    if not samples:
        raise ValueError("findRep requires at least one model")

    # Choose a random starting point. randrange excludes the upper bound,
    # whereas randint(0, len(samples)) is inclusive and could return the
    # out-of-range index len(samples).
    start_index = random.randrange(len(samples))

    # Start with one initial medoid [start_index], i.e. a single cluster.
    clusterer = kmedoids(samples, [start_index], metric=ks_metric)
    clusterer.process()
    medoids = clusterer.get_medoids()
    return medoids[0]

### Find representative for out degree

#### For all human models
* the rep found is ../input/humanOutput\R_20158_run_1.csv
* the average distance between it and others is 0.05515988287586802

#### For human models with $100 \pm 10$ nodes
* the rep found is ../input/human_output_100\R_2015225_run_1.csv
* the average distance between it and others is $0.046150929558524685$

In [5]:
# Position of the out-degree representative within `models`, and the model itself.
od_rep_index = findRep(models, lambda stat: stat.out_d)
od_rep_model = models[od_rep_index]

# Show the representative's file twice: by position in the dict's values,
# then by key lookup. Both lines should name the same file.
print(list(modelToFileName.values())[od_rep_index])
print(modelToFileName[od_rep_model])


../input/human_output_100\R_2015225_run_1.csv
../input/human_output_100\R_2015225_run_1.csv


In [6]:
# Mean KS distance between the out-degree representative and every model in
# `models`. The representative is itself an element of `models`, so it is
# counted in the divisor as well.
# The explicit left-to-right accumulation is kept so the printed value stays
# identical to the recorded output.
total_distance = 0
for model in models:
    total_distance += ks_value(od_rep_model.out_d, model.out_d)
print(total_distance / len(models))

0.046150929558524685


### Find Representative for node activities

#### For all human models
* the rep found is ../input/humanOutput\R_2016176_run_1.csv
* the average distance between it and others is 0.05275267434589047

#### For human models with $100 \pm 10$ nodes
* the rep found is ../input/human_output_100\R_2017419_run_1.csv
* the average distance between it and others is $0.04679429311806747$

In [13]:
# Position of the node-activity representative within `models`, and the model itself.
na_rep_index = findRep(models, lambda stat: stat.na)
na_rep_model = models[na_rep_index]

# Show the representative's file twice: by position in the dict's values,
# then by key lookup. Both lines should name the same file.
print(list(modelToFileName.values())[na_rep_index])
print(modelToFileName[na_rep_model])


../input/human_output_100\R_2017419_run_1.csv
../input/human_output_100\R_2017419_run_1.csv


In [14]:
# Mean KS distance between the node-activity representative and every model
# in `models`. The representative is itself an element of `models`, so it is
# counted in the divisor as well.
# The explicit left-to-right accumulation is kept so the printed value stays
# identical to the recorded output.
total_distance = 0
for model in models:
    total_distance += ks_value(na_rep_model.na, model.na)
print(total_distance / len(models))

0.04679429311806747


### Find Representative for MPC

#### For all human models
* the rep found is ../input/humanOutput\R_2015246_run_1.csv
* the average distance between it and others is 0.08556632702185384

#### For human models with $100 \pm 10$ nodes
* the rep found is ../input/human_output_100\R_2016324_run_1.csv
* the average distance between it and others is $0.07028909225833631$

In [16]:
# Position of the MPC representative within `models`, and the model itself.
mpc_rep_index = findRep(models, lambda stat: stat.mpc)
mpc_rep_model = models[mpc_rep_index]

# Show the representative's file twice: by position in the dict's values,
# then by key lookup. Both lines should name the same file.
print(list(modelToFileName.values())[mpc_rep_index])
print(modelToFileName[mpc_rep_model])

../input/human_output_100\R_2016324_run_1.csv
../input/human_output_100\R_2016324_run_1.csv


In [18]:
# Mean KS distance between the MPC representative and every model in
# `models`. The representative is itself an element of `models`, so it is
# counted in the divisor as well.
# The explicit left-to-right accumulation is kept so the printed value stays
# identical to the recorded output.
total_distance = 0
for model in models:
    total_distance += ks_value(mpc_rep_model.mpc, model.mpc)
print(total_distance / len(models))

0.07028909225833631
