import pickle
import re
import pandas as pd
import numpy as np
# for the stats
import pymc as pm
import bambi as bmb
import pytensor.tensor as pt
from scipy.special import expit, logit
import statsmodels.api as sm
# for plotting
import seaborn as sns
import matplotlib.pyplot as plt
import arviz as az
This is the notebook with the regression analyses for the paper on understanding and measuring disagreements in bird taxonomy. The notebook starts by cleaning and combining data. Those data are then used for the analysis. If you want to run the analysis yourself, you can download the datafile from zenodo, and use that instead of the data-cleaning cells at the start of the notebook (10.5281/zenodo.10078307, file TD_EffortStudy_data.csv).
The traces that were reported in the paper were also uploaded in the same dataset on Zenodo. If you don't want to run the models yourself, you can download them on zenodo in the .netcdf file format with the az.from_netcdf('...').
Note: the DAG used to build all the models in this notebook is described in the manuscript submitted to EJT.
# versions
print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))
re==2.2.1 pandas==1.5.3 numpy==1.24.2 pymc==5.1.2 bambi==0.10.0 statsmodels.api==0.13.5 seaborn==0.12.2 arviz==0.15.1
SEED = 201288
## plotting functions
# function for getting the probabilities
def pordlog(a):
# transform back to cumulative probabilites
pa = expit(a)
p_cum = np.concatenate(([0.], pa, [1.]))
# get the intervals instead of the cumulative probs
return p_cum[1:] - p_cum[:-1]
# function for plotting the posterior predictive proportions of ordinal scores
def ordinal_plot(nscores, df, title, labels,axis):
colors = ['r','b','grey','g','c']
for i in range(nscores):
sns.scatterplot(x=new_hits,
y=df.iloc[i],
alpha = 0.5,
color = colors[i],
label = labels[i],
ax=ax[axis])
ax[axis].legend()
ax[axis].set_title(title)
# function for creating a stackplot of the proportions
def plot_probabilities2(ncuts, trace,cuts,name_lin,title,labels,axis,ylim,xrange):
logit = cuts - trace.predictions[name_lin]
logit_mean = logit.mean(dim = ['chain','draw'])
probabilities = np.array([pordlog(logit_mean[:,i].values) for i in range(len(xrange))])
ax[axis].stackplot(xrange, probabilities.T)
ax[axis].set_title(title, fontsize = 17)
ax[axis].legend(labels,loc = 'lower right')
ax[axis].set_ylim(ymin = ylim, ymax = 1)
ax[axis].set_ylabel('Proportion of total concepts', fontsize = 14)
# disagreement data
conflicts = ['dc_recoded','scdr']
c_dct = {'class_bin':'Taxonomic conflict','dc_bin':'Concept conflict','scdr_bin':'Rank conflict', 'classif_recoded':'Taxonomic conflict','dc_recoded':'Concept conflict', 'scdr':'Rank conflict'}
df = pd.read_csv('...', sep = ";")
df.drop(['Unnamed: 0'], axis=1, inplace=True)
# rename conflict to more meaningful names
df.rename(columns = {'dc_recoded':'concept_ord','scdr':'rank_ord'}, inplace=True)
conflicts = ['concept_ord','rank_ord']
# create df of only BL species
dfl = df.loc[df.in_bl == 1].copy()
print(df.shape)
print(dfl.shape)
dfl.head()
(12730, 108) (10999, 108)
avibase_id | birdlife_id | tsn | eol_page_id | iucn_name | in_bl | genus_bl | name_bl | family_bl | sub_bl | ... | genus_conflict | ep_conflict | nom_conflict | nom_agr | nom_all_dif | nom_bin | ep_bin | gen_bin | concept_ord | classif_recoded | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 95A724BEBE54EED7 | Euplectes diadematus | 560040.0 | 45515027.0 | 22719168.0 | 1.0 | Euplectes | diadematus | NaN | 0.0 | ... | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | D4162FE44AF93BB6 | Centropus bernsteini | 554754.0 | 45517807.0 | 22684215.0 | 1.0 | Centropus | bernsteini | Cuculidae | 0.0 | ... | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 96819591FD94B905 | Phylloscopus fuscatus | 179846.0 | 45510049.0 | 22715264.0 | 1.0 | Phylloscopus | fuscatus | Phylloscopidae | 0.0 | ... | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
1734 | CFFF429637FFF836 | Oreotrochilus stolzmanni | 693766.0 | 1268587.0 | NaN | 1.0 | Oreotrochilus | stolzmanni | Trochilidae | 0.0 | ... | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
1735 | 1C2F4E80D146FA22 | Oxyura maccoa | 175180.0 | 1048502.0 | 22679820.0 | 1.0 | Oxyura | maccoa | NaN | 0.0 | ... | 0 | 0 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 108 columns
# load the google and wos data
effort_df = pd.read_csv('...')
effort_df.drop(['Unnamed: 0'], axis=1, inplace=True)
effort_df.head()
avibase_id | tot_conflict | tax_conflict | nom_conflict | hits1 | hits2 | hits3 | hits4 | hits5 | wos_hits | genus_bl | family_bl_all | order_bl_all | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 95A724BEBE54EED7 | 0.0 | 0.0 | 0.0 | 2710 | 2470 | 2940 | 2460 | 2570 | 1.0 | Euplectes | Ploceidae | passeriformes |
1 | D4162FE44AF93BB6 | 0.0 | 0.0 | 0.0 | 2470 | 2780 | 2490 | 3040 | 3530 | 0.0 | Centropus | Cuculidae | cuculiformes |
2 | 96819591FD94B905 | 3.0 | 3.0 | 0.0 | 40400 | 39000 | 39000 | 43600 | 39900 | 38.0 | Phylloscopus | Phylloscopidae | passeriformes |
3 | CFFF429637FFF836 | 3.0 | 3.0 | 0.0 | 2340 | 2180 | 2100 | 2230 | 1990 | 1.0 | Oreotrochilus | Trochilidae | caprimulgiformes |
4 | 1C2F4E80D146FA22 | 0.0 | 0.0 | 0.0 | 12300 | 12300 | 12700 | 12400 | 12400 | 3.0 | Oxyura | Anatidae | anseriformes |
# load the gbif data
with open('...', "rb") as input_file:
gbif = pickle.load(input_file)
gbif_df = pd.DataFrame(gbif, index = ['gbif']).T
gbif_df.reset_index(inplace=True)
gbif_df.columns = ['birdlife_id','gbif']
gbif_df['birdlife_id'] = [i.capitalize() for i in gbif_df.birdlife_id.values]
gbif_df.to_csv('...')
NOTE: The file created in the cell below is the datafile that has been uploaded to zenodo (TD_EffortStudy_data.csv). If you want to repeat the analysis, just use that file.
## make dataframe with all relevant columns
# select cols
dfcols = ['avibase_id','birdlife_id'] + conflicts
effortcols = ['avibase_id','hits1','hits2', 'hits3', 'hits4', 'hits5', 'wos_hits']
# merge with google and wos
dfx = dfl[dfcols].merge(effort_df[effortcols], on = 'avibase_id',how = 'left')
# merge with gbif occurences
dfx = dfx.merge(gbif_df, on = 'birdlife_id',how = 'left')
## create columns needed for analysis
# gbif occurences: model zeros separately, and take log10 of the nonzero values
dfx['gbif_zero'] = np.where(dfx.gbif == 0, 1, 0)
dfx['gbif_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.gbif.values]
# # idem for wos
dfx['wos_zero'] = np.where(dfx.wos_hits == 0, 1, 0)
dfx['wos_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.wos_hits.values]
# # take mean of the google hits, and then log10
dfx['google_mean'] = dfx[['hits1','hits2', 'hits3', 'hits4', 'hits5']].mean(axis = 1)
dfx['loggoogle'] = np.log10(dfx.google_mean)
dfx.to_csv('...')
dfx.head()
avibase_id | birdlife_id | concept_ord | rank_ord | hits1 | hits2 | hits3 | hits4 | hits5 | wos_hits | gbif | gbif_zero | gbif_lognoneg | wos_zero | wos_lognoneg | google_mean | loggoogle | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 95A724BEBE54EED7 | Euplectes diadematus | 0 | 0 | 2710 | 2470 | 2940 | 2460 | 2570 | 1.0 | 333.0 | 0 | 2.522444 | 0 | 0.000000 | 2630.0 | 3.419956 |
1 | D4162FE44AF93BB6 | Centropus bernsteini | 0 | 0 | 2470 | 2780 | 2490 | 3040 | 3530 | 0.0 | 746.0 | 0 | 2.872739 | 1 | 0.000000 | 2862.0 | 3.456670 |
2 | 96819591FD94B905 | Phylloscopus fuscatus | 1 | 0 | 40400 | 39000 | 39000 | 43600 | 39900 | 38.0 | 87984.0 | 0 | 4.944404 | 0 | 1.579784 | 40380.0 | 4.606166 |
3 | CFFF429637FFF836 | Oreotrochilus stolzmanni | 0 | 3 | 2340 | 2180 | 2100 | 2230 | 1990 | 1.0 | 636.0 | 0 | 2.803457 | 0 | 0.000000 | 2168.0 | 3.336059 |
4 | 1C2F4E80D146FA22 | Oxyura maccoa | 0 | 0 | 12300 | 12300 | 12700 | 12400 | 12400 | 3.0 | 15453.0 | 0 | 4.189013 | 0 | 0.477121 | 12420.0 | 4.094122 |
# check and remove nans
withnans = len(dfx)
dfx = dfx.dropna()
print(f'dropped {withnans - len(dfx)} concepts with missing data')
dropped 4 concepts with missing data
# correlations
dfx[['loggoogle','wos_lognoneg','gbif_lognoneg']].corr().style.background_gradient(cmap='coolwarm')
loggoogle | wos_lognoneg | gbif_lognoneg | |
---|---|---|---|
loggoogle | 1.000000 | 0.756405 | 0.726334 |
wos_lognoneg | 0.756405 | 1.000000 | 0.570808 |
gbif_lognoneg | 0.726334 | 0.570808 | 1.000000 |
All seems to work fine when we simulate this
n = 1000
prob0 = 0.1
x = np.random.normal(0,1,int(n*1-prob0))
x0 = np.zeros(int(n*prob0))
xdata = np.concatenate([x,x0])
zerodata = np.concatenate([np.zeros(int(n*1-prob0)), np.ones(int(n*prob0))])
true_counts = -0.8
true_zeros = 1.5
c1 = 2
c2 = 3
c3 = 3.3
logodds1 = c1 - (true_counts*xdata + true_zeros*zerodata)
logodds2 = c2 - (true_counts*xdata + true_zeros*zerodata)
logodds3 = c3 - (true_counts*xdata + true_zeros*zerodata)
prob1 = expit(logodds1)
prob2 = expit(logodds2)
prob3 = expit(logodds3)
p1 = prob1
p2 = prob2 - prob1
p3 = prob3 - prob2
p4 = 1 - prob3
obsprobs = np.array([p1,p2,p3,p4]).T
y = [np.random.choice(4, 1, p = list(i))[0] for i in obsprobs]
df = pd.DataFrame([xdata,zerodata,y, obsprobs[:,0],obsprobs[:,1],obsprobs[:,2],obsprobs[:,3]], index = ['x','zeros','y','p1','p2','p3','p4']).T
df.head()
x | zeros | y | p1 | p2 | p3 | p4 | |
---|---|---|---|---|---|---|---|
0 | 1.482865 | 0.0 | 0.0 | 0.960315 | 0.024710 | 0.003838 | 0.011137 |
1 | -0.378768 | 0.0 | 0.0 | 0.845141 | 0.091708 | 0.015589 | 0.047562 |
2 | -0.978175 | 0.0 | 3.0 | 0.771616 | 0.130190 | 0.023550 | 0.074643 |
3 | 0.390296 | 0.0 | 0.0 | 0.909885 | 0.054961 | 0.008872 | 0.026282 |
4 | 0.309677 | 0.0 | 0.0 | 0.904456 | 0.058136 | 0.009424 | 0.027984 |
with pm.Model() as simul_ordinal:
# data
data = pm.MutableData('data', xdata)
zerodatas = pm.MutableData('zerodatas', zerodata)
cutpoints = pm.Normal('cutpoints',
mu=[0,1,2],
sigma=5,
transform=pm.distributions.transforms.univariate_ordered,
shape = 3,
)
counts = pm.Normal('counts',0,5)
zeros = pm.Normal('zeros',0,5)
p = pm.Deterministic('p',counts * data + zeros * zerodatas)
y_out = pm.OrderedLogistic("y_out", p, cutpoints, observed=y,compute_p = True)
trace = pm.sample()
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [cutpoints, counts, zeros]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 98 seconds.
az.summary(trace, var_names = ['zeros','counts','cutpoints'])
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
zeros | 1.463 | 0.222 | 1.067 | 1.901 | 0.004 | 0.003 | 2941.0 | 3078.0 | 1.0 |
counts | -0.761 | 0.103 | -0.954 | -0.568 | 0.002 | 0.001 | 3291.0 | 3127.0 | 1.0 |
cutpoints[0] | 1.963 | 0.102 | 1.768 | 2.152 | 0.002 | 0.001 | 2647.0 | 2822.0 | 1.0 |
cutpoints[1] | 2.991 | 0.138 | 2.740 | 3.264 | 0.003 | 0.002 | 3034.0 | 3040.0 | 1.0 |
cutpoints[2] | 3.270 | 0.151 | 2.982 | 3.549 | 0.003 | 0.002 | 3118.0 | 2974.0 | 1.0 |
The technique of separately modeling the zeros is discussed in (Hosmer & Lemeshow's[https://www.amazon.com/dp/0471356328]) book on logistic regression.
plt.hist(dfx.wos_lognoneg, bins = 30)
plt.show()
names = ['rank','concept']
coords = {'conflicts':names}
with pm.Model(coords=coords) as wos_m_ordinal:
wos_data = pm.MutableData('wos_data', dfx.wos_lognoneg.values)
wos_zeros = pm.MutableData('wos_zeros', dfx.wos_zero.values)
counts = pm.Normal('counts',0,1, dims = 'conflicts')
zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
# prior for the cutpoints
cutpoints_rank = pm.Normal('cutpoints_rank',
mu=[0,1,2,3],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 4,
)
cutpoints_concept = pm.Normal('cutpoints_concept',
mu=[0,1],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 2,
)
p_rank = pm.Deterministic('p_rank',counts[0] * wos_data + zeros[0] * wos_zeros)
p_concept = pm.Deterministic('p_concept',counts[1] * wos_data + zeros[1] * wos_zeros)
y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)
pr = pm.sample_prior_predictive()
Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]
with wos_m_ordinal:
trace_wos_ordinal = pm.sample(5000, return_inferencedata = True)
Auto-assigning NUTS sampler... Initializing NUTS using jitter+adapt_diag... Multiprocess sampling (4 chains in 4 jobs) NUTS: [counts, zeros, cutpoints_rank, cutpoints_concept]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 900 seconds.
az.summary(trace_wos_ordinal, var_names = ['counts','zeros','cutpoints_concept','cutpoints_rank'])
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
counts[rank] | -0.375 | 0.068 | -0.503 | -0.248 | 0.001 | 0.000 | 12736.0 | 15484.0 | 1.0 |
counts[concept] | 0.321 | 0.046 | 0.235 | 0.409 | 0.000 | 0.000 | 13972.0 | 15152.0 | 1.0 |
zeros[rank] | 0.632 | 0.080 | 0.486 | 0.786 | 0.001 | 0.001 | 12287.0 | 14311.0 | 1.0 |
zeros[concept] | 0.016 | 0.087 | -0.152 | 0.175 | 0.001 | 0.001 | 15476.0 | 15422.0 | 1.0 |
cutpoints_concept[0] | 2.397 | 0.060 | 2.284 | 2.509 | 0.001 | 0.000 | 13510.0 | 15056.0 | 1.0 |
cutpoints_concept[1] | 4.086 | 0.084 | 3.930 | 4.246 | 0.001 | 0.000 | 18292.0 | 15989.0 | 1.0 |
cutpoints_rank[0] | 2.181 | 0.066 | 2.057 | 2.306 | 0.001 | 0.000 | 11180.0 | 13530.0 | 1.0 |
cutpoints_rank[1] | 2.193 | 0.066 | 2.068 | 2.317 | 0.001 | 0.000 | 11211.0 | 13826.0 | 1.0 |
cutpoints_rank[2] | 2.229 | 0.067 | 2.103 | 2.354 | 0.001 | 0.000 | 11229.0 | 13724.0 | 1.0 |
cutpoints_rank[3] | 4.383 | 0.103 | 4.192 | 4.579 | 0.001 | 0.001 | 16063.0 | 14072.0 | 1.0 |
az.to_netcdf(trace_wos_ordinal, '...')
fig, ax = plt.subplots(figsize = (10,4))
az.plot_forest(trace_wos_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
# posterior predictive checks: outcomes
## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]
new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])
# now sample from the model with this hypothetical data
with wos_m_ordinal:
pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros})
ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
with wos_m_ordinal:
pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros})
ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [y_rank]
Sampling: [y_concept]
# posterior predictive checks: outcomes
new_hits2_w = np.linspace(0,4,100)
new_zeros2_w = np.concatenate([[1], np.zeros(99, dtype = 'int32')])
with wos_m_ordinal:
pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w})
thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
ppc_rank2_wos = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
with wos_m_ordinal:
pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w})
thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
ppc_concept2_wos = pm.sample_posterior_predictive(trace_wos_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
Sampling: []
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))
cuts_concept_wos = trace_wos_ordinal.posterior['cutpoints_concept']
cuts_rank_wos = trace_wos_ordinal.posterior['cutpoints_rank']
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_wos,
cuts_concept_wos,'p_concept','Concept conflict',
['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
0,0.7,new_hits2_w)
cuts_rank = trace_wos_ordinal.posterior['cutpoints_rank']
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_wos,
cuts_rank_wos,'p_rank','Rank conflict',
['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
1,0.7,new_hits2_w)
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score,
concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 800
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 800
# now plot these proportions by increasing wos hits
fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)
ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict',
['No concept conflict','Conflict score 1', 'Conflict score 2'],0)
ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict',
['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)
plt.show()
plt.hist(dfx.gbif_lognoneg, bins = 30)
plt.show()
with pm.Model(coords=coords) as gbif_m_ordinal:
gbif_data = pm.MutableData('gbif_data', dfx.gbif_lognoneg.values)
gbif_zeros = pm.MutableData('gbif_zeros', dfx.gbif_zero.values)
counts = pm.Normal('counts',0,1, dims = 'conflicts')
zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
# prior for the cutpoints
cutpoints_rank = pm.Normal('cutpoints_rank',
mu=[0,1,2,3],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 4,
)
cutpoints_concept = pm.Normal('cutpoints_concept',
mu=[0,1],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 2,
)
p_rank = pm.Deterministic('p_rank',counts[0] * gbif_data + zeros[0] * gbif_zeros)
p_concept = pm.Deterministic('p_concept',counts[1] * gbif_data + zeros[1] * gbif_zeros)
y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)
pr = pm.sample_prior_predictive()
Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]
with gbif_m_ordinal:
trace_gbif_ordinal = pm.sample(5000, return_inferencedata = True)
az.to_netcdf(trace_gbif_ordinal, '...')
az.summary(trace_gbif_ordinal, var_names = ['counts','zeros','cutpoints_rank','cutpoints_concept'])
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
counts[rank] | -0.688 | 0.028 | -0.742 | -0.637 | 0.000 | 0.000 | 17152.0 | 14309.0 | 1.0 |
counts[concept] | 0.078 | 0.025 | 0.032 | 0.125 | 0.000 | 0.000 | 15108.0 | 12877.0 | 1.0 |
zeros[rank] | 0.232 | 0.143 | -0.038 | 0.495 | 0.001 | 0.001 | 20608.0 | 16666.0 | 1.0 |
zeros[concept] | 0.137 | 0.232 | -0.306 | 0.571 | 0.002 | 0.001 | 22212.0 | 15791.0 | 1.0 |
cutpoints_rank[0] | 0.094 | 0.085 | -0.072 | 0.245 | 0.001 | 0.001 | 16693.0 | 13783.0 | 1.0 |
cutpoints_rank[1] | 0.107 | 0.085 | -0.056 | 0.261 | 0.001 | 0.001 | 16777.0 | 13825.0 | 1.0 |
cutpoints_rank[2] | 0.147 | 0.085 | -0.017 | 0.301 | 0.001 | 0.001 | 16917.0 | 14070.0 | 1.0 |
cutpoints_rank[3] | 2.441 | 0.112 | 2.236 | 2.654 | 0.001 | 0.001 | 19981.0 | 15381.0 | 1.0 |
cutpoints_concept[0] | 2.445 | 0.096 | 2.264 | 2.623 | 0.001 | 0.001 | 15189.0 | 13053.0 | 1.0 |
cutpoints_concept[1] | 4.128 | 0.112 | 3.917 | 4.338 | 0.001 | 0.001 | 17088.0 | 13524.0 | 1.0 |
fig, ax = plt.subplots(figsize = (10,4))
az.plot_forest(trace_gbif_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
# posterior predictive checks: outcomes
## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]
new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])
with gbif_m_ordinal:
pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros})
ppc_rank = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_rank'], predictions = True)
with gbif_m_ordinal:
pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros})
ppc_concept = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [y_rank]
Sampling: [y_concept]
# posterior predictive checks: outcomes
new_hits2_gb = np.linspace(0,4,100)
new_zeros2_gb = np.concatenate([[1], np.zeros(99, dtype = 'int32')])
with gbif_m_ordinal:
pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb})
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
ppc_rank2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
with gbif_m_ordinal:
pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb})
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
ppc_concept2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
Sampling: []
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score,
concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400
# now plot these proportions by increasing wos hits
fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)
ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict',
['No concept conflict','Conflict score 1', 'Conflict score 2'],0)
ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict',
['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)
plt.show()
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))
cuts_concept_gbif = trace_gbif_ordinal.posterior['cutpoints_concept']
cuts_rank_gbif = trace_gbif_ordinal.posterior['cutpoints_rank']
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_gbif,
cuts_concept_gbif,'p_concept','Concept conflict',
['No conflict','1 lists disagrees','2 lists disagree'],
0,0.5,new_hits2_gb)
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_gbif,
cuts_rank_gbif,'p_rank','Rank conflict',
['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
1,0.3,new_hits2_gb)
plt.hist(np.log10(dfx.google_mean), bins = 30)
plt.show()
with pm.Model(coords=coords) as google_ordinal:
# data
google_data = pm.MutableData('google_data', dfx.loggoogle.values)
# prior for the cutpoints
cutpoints_rank = pm.Normal('cutpoints_rank',
mu=[0,1,2,3],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 4,
)
cutpoints_concept = pm.Normal('cutpoints_concept',
mu=[0,1],
sigma=1,
transform=pm.distributions.transforms.univariate_ordered,
shape = 2,
)
googlecounts = pm.Normal('googlecounts',0,1, dims = 'conflicts')
p_rank = pm.Deterministic('p_rank',googlecounts[0] * google_data )
p_concept = pm.Deterministic('p_concept',googlecounts[1] * google_data)
# don't compute and store in the trace the inferred probabilities of each categories, based on the cutpoints’ values
# it takes too much memory; set compute_p to False
y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)
with google_ordinal:
trace_google_ordinal = pm.sample(5000)
az.to_netcdf(trace_google_ordinal, '...')
az.summary(trace_google_ordinal, var_names = ['googlecounts','cutpoints_rank','cutpoints_concept'])
mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
googlecounts[rank] | -1.767 | 0.053 | -1.869 | -1.671 | 0.000 | 0.000 | 14747.0 | 13723.0 | 1.0 |
googlecounts[concept] | 0.168 | 0.048 | 0.079 | 0.259 | 0.000 | 0.000 | 17223.0 | 12776.0 | 1.0 |
cutpoints_rank[0] | -4.232 | 0.184 | -4.588 | -3.901 | 0.002 | 0.001 | 14711.0 | 13560.0 | 1.0 |
cutpoints_rank[1] | -4.217 | 0.184 | -4.560 | -3.873 | 0.002 | 0.001 | 14698.0 | 13533.0 | 1.0 |
cutpoints_rank[2] | -4.175 | 0.184 | -4.529 | -3.845 | 0.002 | 0.001 | 14718.0 | 13399.0 | 1.0 |
cutpoints_rank[3] | -1.747 | 0.186 | -2.110 | -1.414 | 0.001 | 0.001 | 16830.0 | 14594.0 | 1.0 |
cutpoints_concept[0] | 2.818 | 0.191 | 2.456 | 3.173 | 0.001 | 0.001 | 17198.0 | 12879.0 | 1.0 |
cutpoints_concept[1] | 4.501 | 0.199 | 4.122 | 4.870 | 0.001 | 0.001 | 17859.0 | 13765.0 | 1.0 |
fig, ax = plt.subplots(figsize = (6,3))
az.plot_forest(trace_google_ordinal, var_names = 'googlecounts',combined = True,ax=ax)
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
# posterior predictive checks: outcomes
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]
new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
with google_ordinal:
pm.set_data({"google_data":new_hits})
ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
with google_ordinal:
pm.set_data({"google_data":new_hits})
thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 10))
ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [googlecounts, y_rank]
Sampling: [y_concept]
# posterior predictive checks: outcomes
new_hits2_g = np.linspace(2,6,100)
with google_ordinal:
pm.set_data({"google_data":new_hits2_g})
thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
ppc_rank2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_rank'], predictions = True)
with google_ordinal:
pm.set_data({"google_data":new_hits2_g})
thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
ppc_concept2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
Sampling: []
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score,
concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400
# now plot these proportions by increasing wos hits
fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)
ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict',
['No concept conflict','Conflict score 1', 'Conflict score 2'],0)
ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict',
['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)
plt.show()
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))
cuts_concept_g = trace_google_ordinal.posterior['cutpoints_concept']
cuts_rank_g = trace_google_ordinal.posterior['cutpoints_rank']
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_g,
cuts_concept_g,'p_concept','Concept conflict',
['No conflict','1 vs 3','2 vs 2'],
0,0.5,new_hits2_g)
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_g,
cuts_rank_g,'p_rank','Rank conflict',
['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
1,0.2,new_hits2_g)
# rank
fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True,sharex = False)
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_wos,
cuts_rank_wos,'p_rank','WoS hits',
[],
0,0.2,new_hits2_w)
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_gbif,
cuts_rank_gbif,'p_rank','GBIF occurences',
['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
1,0.2,new_hits2_gb)
plot_probabilities2(len(dfx.rank_ord.unique()),
ppc_rank2_g,
cuts_rank_g,'p_rank','Google hits',
[],
2,0.2,new_hits2_g)
ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()
ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')
# plt.suptitle('Rank conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)
plt.show()
# concept
fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True)
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_wos,
cuts_concept_wos,'p_concept','WoS hits',
[],
0,0.6,new_hits2_w)
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_gbif,
cuts_concept_gbif,'p_concept','GBIF occurences',
['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
1,0.6,new_hits2_gb)
plot_probabilities2(len(dfx.concept_ord.unique()),
ppc_concept2_g,
cuts_concept_g,'p_concept','Google hits',
[],
2,0.6,new_hits2_g)
ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()
ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')
# plt.suptitle('Concept conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)
plt.show()