Skip to content
Snippets Groups Projects
Commit 9cff590a authored by Solange Emmenegger's avatar Solange Emmenegger
Browse files

Auto-saving for solange.emmenegger@hslu.ch on branch master from commit e869906d

parent 8bf57070
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Data Quality Assessment # Data Quality Assessment
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
from wordcloud import WordCloud from wordcloud import WordCloud
import warnings import warnings
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
%matplotlib inline %matplotlib inline
``` ```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Input In [1], in <cell line: 7>()
5 from sklearn.linear_model import LinearRegression
6 from sklearn.metrics import mean_squared_error
----> 7 from wordcloud import WordCloud
8 import warnings
9 warnings.filterwarnings("ignore")
ModuleNotFoundError: No module named 'wordcloud'
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Read the cars dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="cars.csv")
df.head(n=5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Skewed Data ## Skewed Data
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Pull out the horsepower column and draw its histogram.
horsepower = df.loc[:, "Horsepower"]
horsepower.plot(kind="hist")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Compare the mean and the mode of the raw horsepower values and print
# their difference.
# Series.mode() returns a Series (one entry per tied mode), so the first
# entry is taken explicitly: calling int() on the Series itself raises
# TypeError when several values tie and is deprecated for the length-1 case.
horsepower_mean = horsepower.mean()
horsepower_mode = int(horsepower.mode().iloc[0])
print("Mean:", horsepower_mean)
print("Mode:", horsepower_mode)
print("")
print("Mean - Mode = ", horsepower_mean - horsepower_mode)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Log-Transform Data ### Log-Transform Data
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Log-transform horsepower, plot the histogram of the transformed values and
# repeat the mean/mode comparison on the log scale.
# np.log works on the whole Series at once; no per-element lambda is needed.
horsepower_log = np.log(horsepower)
horsepower_log.plot.hist()
# Keep the mode as a float: log values are fractional, so casting the mode to
# int would truncate it and make "Mean - Mode" meaningless. Series.mode()
# returns a Series, hence the explicit .iloc[0].
log_mean = horsepower_log.mean()
log_mode = horsepower_log.mode().iloc[0]
print("Mean:", log_mean)
print("Mode:", log_mode)
print("\nMean - Mode = ", log_mean - log_mode)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Boxplots ## Boxplots
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Horsepower ### Horsepower
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Box plot of horsepower on a 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
horsepower.plot(kind="box", ax=ax)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Summary statistics for horsepower (count, mean, std, min, quartiles, max).
horsepower.describe(percentiles=[0.25, 0.5, 0.75])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Outlier fences for horsepower: 1.5 * IQR below the first quartile and
# above the third quartile.
q1, q3 = horsepower.quantile([0.25, 0.75])
iqr = q3 - q1
whisker = 1.5 * iqr
lower_boundary = q1 - whisker
upper_boundary = q3 + whisker
print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Year ### Year
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Pull out the year column and draw its box plot on a 12x8 inch figure.
year = df.loc[:, "Year"]
fig, ax = plt.subplots(figsize=(12, 8))
year.plot(kind="box", ax=ax)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Summary statistics for year (count, mean, std, min, quartiles, max).
year.describe(percentiles=[0.25, 0.5, 0.75])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Outlier fences for year: 1.5 * IQR below the first quartile and above the
# third quartile.
q1, q3 = year.quantile([0.25, 0.75])
iqr = q3 - q1
whisker = 1.5 * iqr
lower_boundary = q1 - whisker
upper_boundary = q3 + whisker
print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Correlation ## Correlation
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Annotated heatmap of the pairwise correlations between the columns of df.
# Only numeric/boolean columns are correlated: df presumably also holds text
# data (the "Color" column is one-hot encoded elsewhere in this notebook), and
# DataFrame.corr() raises on such columns in pandas >= 2.0. Older pandas
# versions dropped them silently, so the plotted matrix is the same there.
plt.subplots(figsize=(10, 8))
numeric_corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(numeric_corr, annot=True, cmap='RdYlGn_r', linewidths=0.5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Dummy Variable Trap ## Dummy Variable Trap
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# One-hot encode the Color column (one indicator column per distinct value)
# and preview the result.
colors = pd.get_dummies(df["Color"])
colors.head(n=5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Annotated heatmap of the correlations between the colour indicator columns.
plt.subplots(figsize=(10, 8))
sns.heatmap(
    data=colors.corr(),
    annot=True,
    linewidths=0.5,
    cmap='RdYlGn_r',
)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Avoid Trap ### Avoid Trap
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# One-hot encode Color again, this time dropping the first indicator column,
# and preview the result.
colors = pd.get_dummies(df["Color"], drop_first=True)
colors.head(n=5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Numerical Encoding of Text ## Numerical Encoding of Text
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### TF-IDF ### TF-IDF
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# TF-IDF demo: vectorise three short sentences and show the weights as a
# table with one row per term and one column per sentence.
corpus = [
    "The Limmat flows out of the lake.",
    "The bears are in the bear pit near the river.",
    "The Rhône flows out of Lake Geneva.",
]
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# toarray() gives a plain 2-D ndarray (documents x terms) without going
# through the discouraged numpy.matrix type that todense() returns.
dense = vectors.toarray()
pd.DataFrame(dense, columns=feature_names).transpose()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Word Embeddings ### Word Embeddings
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the word vectors from disk, memory-mapped read-only.
from gensim.models.keyedvectors import KeyedVectors

vectors = KeyedVectors.load(
    "../../cc.de.300-distilled.vec",
    mmap="r",
)
# Reuse the raw vectors as the "normalised" ones.
# NOTE(review): presumably this skips gensim 3.x's own normalisation pass for
# the memory-mapped array; syn0/syn0norm are gensim 3.x attribute names.
# Confirm which gensim version this notebook targets.
vectors.syn0norm = vectors.syn0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Show the embedding vector stored for the word "Mann".
vectors.get_vector("Mann")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Word-analogy query "König - Mann + Frau": most_similar() returns
# (word, similarity) pairs, and only the first one is kept.
# The method is called on `vectors` directly: it is already a KeyedVectors
# instance, and the `.wv` alias on KeyedVectors was deprecated in gensim 3.x
# and no longer exists in gensim 4.
most_similar = vectors.most_similar(
    positive=["Frau", "König"],
    negative=["Mann"],
)[0]
print("König - Mann + Frau = ", most_similar)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Profile Report ## Profile Report
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Build a pandas-profiling report for df with the full-width HTML style,
# write it to disk and display it in the notebook.
import pandas_profiling

profile = pandas_profiling.ProfileReport(
    df,
    html={'style': {'full_width': True}},
)
# Save report
# NOTE(review): the output name "c" has no extension and looks truncated.
# Confirm the intended file name.
profile.to_file(output_file="c")
profile
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment