*Python Code Used in the Book*
(sorted by section number; sections with nothing listed beneath them have no code in the book)

Last updated or corrected August 3, 2022 @ 2:49 PM MT. Please send corrections to daniel.denis@umontana.edu


Chapter 1

1.1

1.2

1.3

1.4

1.5

1.6

1.6.1

1.7

1.8

1.9

1.10


Chapter 2

2.1

2.2

2.3


import numpy

x = numpy.array([0, 1, 2, 3, 4, 5])
numpy.sum(x)


import numpy as np
x = np.array([0, 1, 2, 3, 4, 5])
np.sum(x)



import numpy as numpywhichisquitefascinating
numpywhichisquitefascinating.sum(x)


x = np.array([1, 5, 8, 2, 7, 4])

x = np.array([1, 5, 8, 2, 7, 4])
x[4]


y = np.array([4, 6, 8, 2, 4, 1])


np.concatenate([x, y])
 
2.4

2.5

data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75, 85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]}
import pandas as pd
df = pd.DataFrame(data)
df

import statistics
ac = df['ac']
z_numerator = (ac - statistics.mean(ac))
z_denominator = statistics.stdev(ac)
z = z_numerator/z_denominator
z

from scipy import stats
ac = df['ac']
stats.zscore(ac, axis=0, ddof=1)
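
Both approaches should agree; a quick check (not found in book) confirms the manual z-scores match scipy's:

import numpy as np
np.allclose(z, stats.zscore(ac, axis=0, ddof=1))   # True: both use the sample (ddof=1) standard deviation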

import matplotlib.pyplot as plt
ac = df['ac']
z = stats.zscore(ac, axis=0, ddof=1)
ac_hist = plt.hist(ac)
z_hist = plt.hist(z)

import scipy as sp
sp.stats.skew(ac)

sp.stats.kurtosis(ac)

2.6

data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75, 85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]}
import pandas as pd
df = pd.DataFrame(data)
df


df['ac']

ac = df['ac']
mean = sum(ac)/len(ac)
mean

import statistics
mean = statistics.mean(ac)
mean

import numpy as np
np.mean(ac)

import math
math.sqrt(89.20)   # 89.20 is the population variance of ac; the square root gives the sd

sd = statistics.pstdev(ac)
sd

statistics.median(ac)

statistics.mode(ac)

2.7

import pandas as pd

iq_data = pd.read_csv('iq.data.txt')
iq_data

dataset = pd.read_csv("iq_data.txt", delimiter="\t")
dataset


2.8

import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()

2.9

import numpy as np
np.random.randn(100)

2.10

import math
math.e
math.pi

2.11

cov_matrix = pd.DataFrame([(10, 20), (5, 15), (20, 12), (8, 17)],
columns=['var1', 'var2'])

cov_matrix

cov_matrix.cov()
cov_matrix.corr()
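
The two matrices are related: each correlation is a covariance divided by the product of the two standard deviations. A quick check (not found in book):

# r(var1, var2) = cov(var1, var2) / (sd of var1 * sd of var2)
c = cov_matrix.cov()
c.loc['var1', 'var2'] / (cov_matrix['var1'].std() * cov_matrix['var2'].std())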

2.11.1

import numpy as np
np.zeros((4, 4))

import numpy as np
np.empty((4, 4))

import numpy as np
np.ones((4, 2))

import numpy as np
A = np.matrix('1 2; 3 4')
A

import numpy as np
B = np.array([[1, 2],
[3, 4]])
B

import numpy as np
np.transpose(B)

import numpy as np
np.matrix.trace(B)

import numpy as np
np.matrix.diagonal(B)

2.11.2

B = np.array([[1, 2], [3, 4]])
B

results = np.linalg.eig(B)
results
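
Each eigenpair satisfies B v = lambda v; a quick verification (not found in book):

eigenvalues, eigenvectors = np.linalg.eig(B)
# columns of eigenvectors pair with the entries of eigenvalues
np.allclose(B @ eigenvectors, eigenvectors * eigenvalues)   # True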


Chapter 3

3.1

3.2


import pandas as pd
data = pd.read_csv("population.csv")
data.head()

state = data['NAME']
state

pop_change_2019 = data['PPOPCHG_2019']
pop_change_2019

N_pop_change_2019 = data['NPOPCHG_2019']
N_pop_change_2019

import matplotlib.pyplot as plt
import numpy as np
plt.plot(data["NPOPCHG_2019"], data["PPOPCHG_2019"], "o")
plt.xlabel("NPOPCHG_2019")
plt.ylabel("PPOPCHG_2019")

3.3

3.4


x = [10, 15, 16, 23, 27, 38, 43, 56, 57, 60]
y = [5, 8, 9, 13, 16, 20, 40, 45, 67, 75]
import matplotlib.pyplot as plt
import numpy as np
plt.hist2d(x, y, bins=(50, 50), cmap=plt.cm.Reds)

3.5

import matplotlib.pyplot as plt
import seaborn as sns

df = sns.load_dataset('iris')
df.head()

sns.pairplot(df, kind="scatter", hue="species", markers=["o", "s", "D"], palette="Set2")
plt.show()

sns.pairplot(df, kind="scatter", hue="species", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()

3.6

import numpy as np
import matplotlib.pyplot as plt
height = [10, 20, 30, 75, 100]
bars = ('A', 'B', 'C', 'D', 'E')
y_pos = np.arange(len(bars))
plt.bar(y_pos, height, color=(0.2, 0.4, 0.4, 0.6))

3.7

import seaborn as sns
df = sns.load_dataset('iris')
 
sns.distplot( df["sepal_length"] , color="skyblue", label="Sepal Length")
sns.distplot( df["sepal_width"] , color="red", label="Sepal Width")
plt.show()

3.8

x = np.random.rand(40)
y = np.random.rand(40)
z = np.random.rand(40)

plt.scatter(x, y, s=z*1000, alpha=0.5)
plt.show()

plt.scatter(x, y, s=z*10000, alpha=0.5)
plt.show()

3.9

import pandas as pd
df = pd.DataFrame([8,8,1,2], index=['a', 'b', 'c', 'd'], columns=['x'])

df.plot(kind='pie', subplots=True, figsize=(8, 8))

3.10

import seaborn as sns
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.random((5,5)),
columns=["a","b","c","d","e"])
df

map = sns.heatmap(df)

import seaborn as sns
df = sns.load_dataset('iris')

sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='hex')
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='scatter')
sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='kde')

3.11

import matplotlib.pyplot as plt
import numpy as np

values=np.cumsum(np.random.randn(1000,1))

plt.plot(values)


3.12

Chapter 4

4.1

4.2

4.3


x = [0, 2, 6, 7, 15]
y = [0, 1, 8, 13, 20]
df = pd.DataFrame({'x': x, 'y': y})
df

import scipy.stats
scipy.stats.pearsonr(x,y)

scipy.stats.spearmanr(x,y)

plt.scatter(x, y)

4.4

4.5


import numpy as np
np.random.seed(1)
x = np.random.randint(0, 50, 1000)
y = x + np.random.normal(0, 10, 1000)
np.corrcoef(x, y)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
plt.scatter(x, y)

galton = pd.read_csv('Galton.csv')

galton.head()

from scipy import stats
pearson_coef, p_value = stats.pearsonr(galton["child"],
galton["parent"])
print("Pearson Correlation: ", pearson_coef, "and a P-value of:", p_value)

import seaborn as sns
sns.pairplot(galton)

parent = galton['parent']
child = galton['child']

columns = ['child', 'parent']
ax1 = galton.plot.scatter(x = 'child', y = 'parent')

4.6

iq = [105, 98, 110, 105, 95]
df = pd.DataFrame(iq)
df

from scipy import stats
stats.ttest_1samp(df, 100.0)

from scipy import stats
stats.ttest_ind(parent, child)

plt.figure(figsize=(10, 7))
sns.distplot(parent)

plt.figure(figsize=(9, 5))
sns.distplot(child)

sns.boxplot(parent)
sns.boxplot(child)

4.7

trial_1 = [10, 12.1, 9.2, 11.6, 8.3, 10.5]
trial_2 = [8.2, 11.2, 8.1, 10.5, 7.6, 9.5]
paired_data = pd.DataFrame({'trial_1': trial_1, 'trial_2': trial_2})
paired_data

stats.ttest_rel(trial_1, trial_2)

4.8

stats.binom_test(2, n=5, p=0.5, alternative='greater')
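
The p-value here is P(X >= 2) for a Binomial(n=5, p=0.5) variable; computing it directly (not found in book) reproduces the result:

from scipy.stats import binom
1 - binom.cdf(1, 5, 0.5)   # P(X >= 2) = 1 - P(X <= 1) = 0.8125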

4.9

from scipy.stats import chisquare
chisquare([16, 16, 16, 16, 16])

chisquare([16, 15, 16, 15, 16])

chisquare([16, 15, 10, 8, 25])

4.10

import numpy as np
matrix = np.array([[20, 10],
[5, 15]])
matrix

from scipy.stats import chi2_contingency
obs = np.array([[20, 10], [5, 15]])
chi2_contingency(obs)


Chapter 5

5.1

5.2

5.3

5.4

5.5

5.6

5.7


from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.05
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None, ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)

from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.05
power = 0.9
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None, ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)

5.8

from statsmodels.stats.power import TTestIndPower
effect = 0.2
alpha = 0.05
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None, ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)

5.9

from statsmodels.stats.power import TTestIndPower
effect = 0.8
alpha = 0.20
power = 0.8
analysis = TTestIndPower()
result = analysis.solve_power(effect, power=power, nobs1=None, ratio=1.0, alpha=alpha)
print('Sample Size: %.3f' % result)
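
Sections 5.7 through 5.9 vary one input at a time; a compact loop (not found in book) runs the same solver across several assumed effect sizes:

from statsmodels.stats.power import TTestIndPower
analysis = TTestIndPower()
# required sample size per group at power = 0.8 and alpha = 0.05
for effect in [0.2, 0.5, 0.8]:
    result = analysis.solve_power(effect, power=0.8, nobs1=None, ratio=1.0, alpha=0.05)
    print('effect = %.1f, sample size: %.3f' % (effect, result))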

5.10

5.11



Chapter 6

6.1

6.2

6.3


data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75, 85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]}
df = pd.DataFrame(data)
df

df['ac'].hist(by=df['teach'])

df.sum()

df.mean()

df.std()

df.median()

df.describe()

6.4

import scipy as sp
ac = df['ac']
sp.stats.shapiro(ac)

import scipy as sp
teach = df['teach']
sp.stats.levene(ac, teach)
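
Note that levene() treats each argument as a sample, so the call above compares ac and teach as two samples. To test homogeneity of variance in ac across the four teach groups (the usual ANOVA assumption), a grouped call (not found in book) would be:

# split ac by teach group and pass the four groups to levene
groups = [group['ac'] for _, group in df.groupby('teach')]
sp.stats.levene(*groups)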

6.5

df.info()

import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('ac ~ C(teach)', data=df).fit()

table = sm.stats.anova_lm(model, typ=2)
print(table)

6.6

6.7

6.8


from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)
post = pairwise_tukeyhsd(df['ac'], df['teach'])
print(post)

6.9

data = {'ac' : [70, 67, 65, 75, 76, 73, 69, 68, 70, 76, 77, 75, 85, 86, 85, 76, 75, 73, 95, 94, 89, 94, 93, 91],
'teach' : [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4],
'text' : [1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2]}
df_2 = pd.DataFrame(data)
df_2

import statsmodels.api as sm
model_2 = ols('ac ~ C(teach) + C(text)', data = df_2).fit()
table_2 = sm.stats.anova_lm(model_2, typ = 2)
print(table_2)

6.10

6.11

6.12
 
import statsmodels.api as sm
from statsmodels.formula.api import ols
model_3 = ols('ac ~ C(teach) + C(text) + C(teach)*C(text)', data = df_2).fit()
table_3 = sm.stats.anova_lm(model_3, typ = 2)
print(table_3)


6.13


import statsmodels.api as sm
res = model_3.resid
fig = sm.qqplot(res, line='s')

6.14

rat = pd.DataFrame({'rat': np.repeat([1, 2, 3, 4, 5, 6], 3),
'trial': np.tile([1, 2, 3], 6),
'time': [10.0, 8.2, 5.3, 
         12.1, 11.2, 9.1,
         9.2, 8.1, 4.6,
         11.6, 10.5, 8.1,
         8.3, 7.6, 5.5,
         10.5, 9.5, 8.1]})
rat

from statsmodels.stats.anova import AnovaRM
print(AnovaRM(data = rat, depvar = 'time', subject = 'rat', within = ['trial']).fit())

6.15

6.15.1

6.15.2

6.15.3

6.15.4

data = {'grade':[0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'studytime': [30, 25, 59, 42, 31, 140, 90, 95, 170, 120]}
df = pd.DataFrame(data)
df

grade = df['grade']
studytime = df['studytime']

import scipy
from scipy.stats import mannwhitneyu

sample1 = 30, 25, 59, 42, 31
sample2 = 140, 90, 95, 170, 120
stat, p = mannwhitneyu(sample1, sample2)
stat, p

6.15.5

x = [70, 67, 65, 75, 76, 73]
y = [69, 68, 70, 76, 77, 75]
z = [85, 86, 85, 76, 75, 73]
w = [95, 94, 89, 94, 93, 91]

from scipy import stats
stats.kruskal(x, y, z, w)

stats.kruskal(x, y)

pip install scikit-posthocs
import scikit_posthocs as sp

v = [[70, 67, 65, 75, 76, 73], [69, 68, 70, 76, 77, 75], [85, 86, 85, 76, 75, 73], [95, 94, 89, 94, 93, 91]]
sp.posthoc_nemenyi(v)


Chapter 7

7.1

7.2

7.3

7.4

7.5

7.6

7.7

7.8

7.9


pip install pyreadstat
import pyreadstat
df, meta = pyreadstat.read_sav("iq_data.sav")
df

y = df["verbal"]
x = df["quant"]
import numpy as np
import matplotlib.pyplot as plt
plt.scatter(x, y)

plt.title("Scatterplot of Verbal on Quant")
plt.xlabel("Quant”)
plt.ylabel("Verbal")

import statsmodels.api as sm
x = sm.add_constant(x)
x

model = sm.OLS(y, x).fit()
print_model = model.summary()
print(print_model)

7.10

7.11

7.12

7.13


import pyreadstat
df, meta = pyreadstat.read_sav("iq_data.sav")
df

import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm

X = df[['quant', 'analytic']]
Y = df['verbal']

regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)

print('Coefficients: \n', regr.coef_)

new_quant = 20
new_analytic = 25
print('predicted verbal: \n', regr.predict([[new_quant, new_analytic]]))

***TO GET P-VALUES, USE THE FOLLOWING CODE (NOT FOUND IN BOOK):

import statsmodels.api as sm
X = df[['quant', 'analytic']]
Y = df['verbal']
X = sm.add_constant(X)
model = sm.OLS(Y, X).fit()
predictions = model.predict(X)
print_model = model.summary()
print(print_model)

7.14


7.15

7.16

7.17


Chapter 8

8.1

8.2

8.3

odds = 0.5/(1-0.5)
odds

import numpy as np
np.log(odds)

odds = 1.0/(1.0-1.0)   # raises ZeroDivisionError: the odds are undefined when p = 1

odds = 0.0/(1.0-0.0)
odds

2**3

np.log(2)

np.exp(0.6931471805599453)
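
These pieces chain together; a short round trip (not found in book) converts a probability to a logit and back:

import numpy as np
p = 0.7
odds = p/(1 - p)                     # probability to odds
logit = np.log(odds)                 # odds to log-odds (the logit)
np.exp(logit)/(1 + np.exp(logit))    # inverse logit recovers p = 0.7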

8.4

import pandas as pd
data = {'oring' : [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
'temp' : [53, 57, 58, 63, 66, 67, 67, 67, 68, 69, 70, 70, 70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 81]}
df_challenger = pd.DataFrame(data)
print(df_challenger)

df_challenger['oring'].value_counts()

import seaborn as sns
sns.countplot(x='oring', data = df_challenger, palette='hls')

import statsmodels.api as sm

y = df_challenger['oring']
X = df_challenger['temp']
X_const = sm.add_constant(X)

model = sm.Logit(y, X_const)
results = model.fit()

print(results.summary())

predicted_logit = 15.0429 - 0.2322*X

# worked out for the first observation (temp = 53):
# predicted_logit = 15.0429 - 0.2322(53) = 2.7363
predicted_logit

from scipy import stats   
stats.spearmanr(predicted_logit, X)

import matplotlib.pyplot as plt
import numpy as np
plt.plot(predicted_logit, X)
plt.xlabel("predicted logit")
plt.ylabel("temp")
plt.title("temp as a function of logit")

8.5

import pandas as pd
import numpy as np
import statsmodels.api as sm

df = pd.read_csv('Smarket.csv', index_col=0, parse_dates=True)
df.head()

import statsmodels.formula.api as smf
# model formula: Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume

change = np.where(df['Direction']=='Up', 1, 0)
model_constant = sm.add_constant(df[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']])
model = sm.GLM(change, model_constant, family=sm.families.Binomial()).fit()
print(model.summary())

import numpy as np
np.exp(-0.0731)

prob = np.exp(-0.0731)/(1 + np.exp(-0.0731))
prob

8.5.1

model_constant = sm.add_constant(df[['Lag1']])
model = sm.GLM(change, model_constant, family=sm.families.Binomial()).fit()
print(model.summary())

8.6


Chapter 9


9.1

9.2

9.3

9.4

9.4.1

9.4.2

9.4.3

9.4.4

9.5

9.6


import pandas as pd
data = {'quant' : [5, 2, 6, 9, 8, 7, 9, 10, 10],
'verbal' : [2, 1, 3, 7, 9, 8, 8, 10, 9],
'train' : [1, 1, 1, 2, 2, 2, 3, 3, 3]}

df_manova = pd.DataFrame(data)
print(df_manova)

print(df_manova.dtypes)

cols = ['train']
for col in cols:
    df_manova[col] = df_manova[col].astype('category')   
print(df_manova.dtypes)

from statsmodels.multivariate.manova import MANOVA
maov = MANOVA.from_formula('quant + verbal ~ train', data = df_manova)
print(maov.mv_test())

9.7

9.8

9.9

9.10

data_discrim = {'y' : [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
'x1' : [4, 3, 3, 2, 2, 8, 7, 5, 3, 3],
'x2' : [2, 1, 2, 2, 5, 3, 4, 5, 4, 2]}

df_discrim = pd.DataFrame(data_discrim)
df_discrim

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
X = np.array([[4, 2], [3, 1], [3, 2], [2, 2], [2, 5], [8, 3], [7, 4], [5, 5], [3, 4], [3, 2]])

lda = LinearDiscriminantAnalysis(n_components=2)   # with two classes there is at most one discriminant, so this errors in recent scikit-learn; corrected below
lda.fit(X, y)

lda = LinearDiscriminantAnalysis(n_components=1)
model = lda.fit(X, y)

scores = lda.transform(X)
scores

print(lda.scalings_)

y = -3.283 + 0.49739549(x1) + 0.43107609(x2)

For the first observation (x1 = 4, x2 = 2):

y = -3.283 + 0.49739549(4) + 0.43107609(2)
  = -3.283 + 1.98958196 + 0.86215218
  = -0.43126586
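
The same score can be reproduced from the fitted model; sklearn's default 'svd' solver centers the data at lda.xbar_ before projecting, so (not found in book):

# first observation, centered and projected; matches the hand computation up to rounding
(X[0] - lda.xbar_) @ lda.scalings_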

m = np.dot(lda.means_ - lda.xbar_, lda.scalings_)
m

pred=model.predict(X)
pred

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(confusion_matrix(pred, y))
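
accuracy_score is imported above but not used; it summarizes the same table as a single proportion (not found in book):

accuracy_score(y, pred)   # proportion of correctly classified cases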

9.11

train = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
X = np.array([[5, 2], [2, 1], [6, 3], [9, 7], [8, 9], [7, 8], [9, 8], [10, 10], [10, 9]])
X

model = lda.fit(X, train)
model

print(lda.scalings_)

lda.transform(X).shape

lda.transform(X)

pred = model.predict(X)
pred

print(confusion_matrix(pred, train))

9.12

9.13



Chapter 10


10.1


x = [.00, .90, 1.80, 2.60, 3.30, 4.40, 5.20, 6.10, 6.50, 7.40]
y = [5.90, 5.40, 4.40, 4.60, 3.50, 3.70, 2.80, 2.80, 2.40, 1.50]
pca_data = pd.DataFrame({'x': x, 'y': y})
pca_data

import statistics
statistics.variance(x)

statistics.variance(y)

total_variance = statistics.variance(x) + statistics.variance(y)
total_variance

10.2

data = np.array([x, y])
data

covMatrix = np.cov(data, bias = False)
covMatrix

import seaborn as sn
sn.heatmap(covMatrix, annot=True, fmt='g')

import numpy.linalg as la
eigen = la.eig(covMatrix)
eigen

eigenvalue, eigenvector = la.eig(covMatrix)
eigenvalue, eigenvector
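
One property worth checking (not found in book): because eigendecomposition preserves the trace of the covariance matrix, the eigenvalues sum to the total variance computed in 10.1:

eigenvalue.sum()   # equals statistics.variance(x) + statistics.variance(y)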

10.3

import pandas as pd
data = {'x': [0, 0.9, 1.8, 2.6, 3.3, 4.4, 5.2, 6.1, 6.5, 7.4],
'y': [5.9, 5.4, 4.4, 4.6, 3.5, 3.7, 2.8, 2.8, 2.4, 1.5]}
df = pd.DataFrame(data,columns = ['x', 'y'])
corrMatrix = df.corr()
print(corrMatrix)

import numpy.linalg as la
eigen = la.eig(corrMatrix)
eigen

10.4

10.5


A = np.array([[5.9, 0.0], [5.4, 0.9], [4.4, 1.8], [4.6, 2.6], [3.5, 3.3], [3.7, 4.4], [2.8, 5.2], [2.8, 6.1], [2.4, 6.5], [1.5, 7.4]])
A

import matplotlib.pyplot as plt
plt.plot(x, y)

plt.plot(x, y, 'o', color='black')

from sklearn.decomposition import PCA
pca = PCA(2)
pca.fit(A)

print(pca.components_)

print(pca.explained_variance_)

from sklearn.decomposition import PCA
pca = PCA(1)
pca.fit(A)

print(pca.components_)

print(pca.explained_variance_)

10.6

10.7

10.8


data = pd.read_csv('usarrests.csv')
data.head(10)

df = pd.DataFrame(data, columns=['Murder', 'Assault', 'Urbanpop', 'Rape'])   # 'Urbanpop' is misspelled, so that column comes back as NaN; corrected below
df

df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop', 'Rape'])
df

import numpy as np
np.mean(df)

np.var(df)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)

scaled_data = scaler.transform(df)
scaled_data

scaled_data_df = pd.DataFrame(scaled_data, columns = df.columns)
scaled_data_df

np.mean(scaled_data_df)

np.var(scaled_data_df)

from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca.fit(scaled_data_df)

pca.components_

print(pca.explained_variance_ratio_)

from sklearn.decomposition import PCA
pca = PCA(n_components=4)
pca.fit(df)

pca.components_

print(pca.explained_variance_)

df.var()

df.std()

10.9

pip install pca
from pca import pca

pip install yellowbrick
import yellowbrick

model = pca(n_components=4)

X = scaled_data
results = model.fit_transform(X)

results

fig, ax = model.scatter()

fig, ax = model.biplot(n_feat=4)

from yellowbrick.features import PCA as yb_PCA   # yellowbrick's PCA visualizer (aliased to avoid clashing with sklearn's PCA)
visualizer = yb_PCA(scale=True, proj_features=True)
visualizer.fit_transform(X)
visualizer.show()


fig, ax = model.plot()


Chapter 11

11.1

11.2

11.3

11.4

11.5


import pandas as pd
import numpy as np

pca_data = np.array([[0, 5.90], [.90, 5.40], [1.80, 4.40], [2.60, 4.60], [3.30, 3.50], [4.40, 3.70], [5.20, 2.80], [6.10, 2.80], [6.50, 2.40], [7.40, 1.50]])

pca_data


pip install factor_analyzer
import factor_analyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(pca_data)
chi_square_value, p_value

from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model = calculate_kmo(pca_data)

kmo_model

pip install factor_analyzer
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer(rotation=None, method='minres', n_factors=1)

fa.fit(pca_data)

fa.loadings_

fa.get_communalities()

11.6

data = pd.read_csv('usarrests.csv')
data.head(10)

pip install factor_analyzer
import pandas as pd
from factor_analyzer import FactorAnalyzer

df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop', 'Rape'])

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)

scaled_data = scaler.transform(df)
scaled_data

fa = FactorAnalyzer()
fa.set_params(n_factors=4, rotation=None)
fa.fit(scaled_data)

fa.loadings_

ev, v = fa.get_eigenvalues()
ev


fa.get_communalities()

(0.84370394)**2 + (-0.37474146)**2 + (-0.07321271)**2   # communality = sum of squared loadings

1-0.85762759   # uniqueness = 1 - communality

fa_varimax = FactorAnalyzer(rotation='varimax')
fa_varimax.fit(scaled_data)

fa_varimax.loadings_

fa_varimax.get_communalities()


Chapter 12

12.1

12.2

12.3

12.4

12.5


from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
iris = sns.load_dataset("iris")
print(iris.head())

df = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
df.head(3)

kmeans = KMeans(n_clusters=3)
kmeans.fit(df)
kmeans.predict(df)

centroids = kmeans.cluster_centers_
centroids

kpredict = kmeans.predict(df)
plt.scatter(iris['petal_length'], iris['petal_width'], c = kpredict, cmap = 'cool')

plt.scatter(iris['sepal_length'], iris['sepal_width'], c = kpredict, cmap = 'cool')

cluster = kmeans.labels_
cluster1 = iris.loc[cluster == 0]
cluster2 = iris.loc[cluster == 1]
cluster3 = iris.loc[cluster == 2]

cluster1.describe()

12.6

12.7


df = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

iris.drop(['species'], axis=1, inplace = True)
iris

import seaborn as sns
sns.pairplot(iris)

from sklearn.cluster import AgglomerativeClustering
groups = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='single')
groups.fit_predict(iris)

iris

plt.scatter(iris['petal_length'], iris['petal_width'], c = groups.labels_, cmap='cool')

End of Code.