Auto-saving for solange.emmenegger@hslu.ch on branch master from commit e869906d

9cff590a · Solange Emmenegger · 8bf57070 · 9cff590a
Commit 9cff590a authored 2 years ago by Solange Emmenegger
--- a/notebooks/01A Data Quality Assessment/Data Quality Assessment Examples.ipynb
+++ b/notebooks/01A Data Quality Assessment/Data Quality Assessment Examples.ipynb
@@ -9,9 +9,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'wordcloud'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mean_squared_error\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwordcloud\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WordCloud\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[1;32m      9\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'wordcloud'"
+     ]
+    }
+   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",

 %% Cell type:markdown id: tags:
 # Data Quality Assessment
 %% Cell type:code id: tags:
 ``` python
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_squared_error
 from wordcloud import WordCloud
 import warnings
 warnings.filterwarnings("ignore")
 %matplotlib inline
 ```
+%% Output
+    ---------------------------------------------------------------------------
+    ModuleNotFoundError                       Traceback (most recent call last)
+Input     In [1], in <cell line: 7>()
+          5 from sklearn.linear_model import LinearRegression
+          6 from sklearn.metrics import mean_squared_error
+    ----> 7 from wordcloud import WordCloud
+          8 import warnings
+          9 warnings.filterwarnings("ignore")
+    ModuleNotFoundError: No module named 'wordcloud'
 %% Cell type:code id: tags:
 ``` python
 # Read cars.csv (path is relative to the notebook's working directory) and
 # preview the first rows as a quick sanity check of the columns.
 df = pd.read_csv("cars.csv")
 df.head()
 ```
 %% Cell type:markdown id: tags:
 ## Skewed Data
 %% Cell type:code id: tags:
 ``` python
 # Pull out the Horsepower column and plot its histogram to inspect the
 # shape of the distribution.
 horsepower = df["Horsepower"]
 horsepower.plot.hist()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Series.mode() returns a Series (a column can have several modes, sorted
 # ascending), so pick the first entry explicitly. Calling int() on the whole
 # Series only works when there is exactly one mode.
 horsepower_mean = horsepower.mean()
 horsepower_mode = int(horsepower.mode().iloc[0])
 print("Mean:", horsepower_mean)
 print("Mode:", horsepower_mode)
 print("")
 # A large gap between mean and mode is used here as a simple skew indicator.
 print("Mean - Mode = ", horsepower_mean - horsepower_mode)
 ```
 %% Cell type:markdown id: tags:
 ### Log-Transform Data
 %% Cell type:code id: tags:
 ``` python
 # np.log is vectorised, so it can be applied to the whole Series at once.
 horsepower_log = np.log(horsepower)
 horsepower_log.plot.hist()
 # Keep the mode as a float: on the log scale, truncating with int()
 # (e.g. 4.6 -> 4) would distort the mean/mode comparison. mode() returns a
 # Series, so take its first (smallest) entry explicitly.
 horsepower_log_mean = horsepower_log.mean()
 horsepower_log_mode = horsepower_log.mode().iloc[0]
 print("Mean:", horsepower_log_mean)
 print("Mode:", horsepower_log_mode)
 print("\nMean - Mode = ", horsepower_log_mean - horsepower_log_mode)
 ```
 %% Cell type:markdown id: tags:
 ## Boxplots
 %% Cell type:markdown id: tags:
 ### Horsepower
 %% Cell type:code id: tags:
 ``` python
 # Create a 12x8 inch figure and draw the horsepower boxplot on its axes.
 fig, ax = plt.subplots(figsize=(12,8))
 horsepower.plot.box(ax=ax)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Summary statistics of horsepower, including the 25% / 75% quartiles.
 horsepower.describe()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Outlier limits for horsepower: 1.5 * IQR below the first quartile and
 # above the third quartile.
 horsepower_summary = horsepower.describe()
 q1, q3 = horsepower_summary.loc['25%'], horsepower_summary.loc['75%']
 iqr = q3 - q1
 lower_boundary, upper_boundary = q1 - 1.5 * iqr, q3 + 1.5 * iqr
 print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
 ```
 %% Cell type:markdown id: tags:
 ### Year
 %% Cell type:code id: tags:
 ``` python
 # Pull out the Year column and draw its boxplot on a 12x8 inch figure.
 year = df["Year"]
 fig, ax = plt.subplots(figsize=(12,8))
 year.plot.box(ax=ax)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Summary statistics of Year, including the 25% / 75% quartiles.
 year.describe()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Outlier limits for Year: 1.5 * IQR below the first quartile and above the
 # third quartile.
 year_summary = year.describe()
 q1, q3 = year_summary.loc['25%'], year_summary.loc['75%']
 iqr = q3 - q1
 lower_boundary, upper_boundary = q1 - 1.5 * iqr, q3 + 1.5 * iqr
 print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
 ```
 %% Cell type:markdown id: tags:
 ## Correlation
 %% Cell type:code id: tags:
 ``` python
 # Correlate only numeric/bool columns. Non-numeric columns (Color is
 # presumably text - confirm against cars.csv) cannot be correlated, and
 # pandas >= 2.0 raises on them instead of silently dropping them.
 numeric_df = df.select_dtypes(include=["number", "bool"])
 # Draw on an explicit axes so the heatmap does not depend on pyplot's
 # implicit "current axes" state.
 fig, ax = plt.subplots(figsize=(10, 8))
 sns.heatmap(numeric_df.corr(), annot=True, cmap='RdYlGn_r', linewidths=0.5, ax=ax)
 ```
 %% Cell type:markdown id: tags:
 ## Dummy Variable Trap
 %% Cell type:code id: tags:
 ``` python
 # One-hot encode Color: one indicator column per category, all categories
 # kept, so the columns are linearly dependent (they always sum to 1).
 colors = pd.get_dummies(df.Color)
 colors.head()
 ```
 %% Cell type:code id: tags:
 ``` python
 # Heatmap of the pairwise correlations between the one-hot colour columns.
 fig, ax = plt.subplots(figsize=(10, 8))
 color_corr = colors.corr()
 sns.heatmap(color_corr, annot=True, cmap='RdYlGn_r', linewidths=0.5, ax=ax)
 ```
 %% Cell type:markdown id: tags:
 ### Avoid Trap
 %% Cell type:code id: tags:
 ``` python
 # drop_first=True omits the indicator column of the first category, leaving
 # k-1 columns for k categories; the dropped category is encoded as all zeros.
 colors = pd.get_dummies(df.Color, drop_first=True)
 colors.head()
 ```
 %% Cell type:markdown id: tags:
 ## Numerical Encoding of Text
 %% Cell type:markdown id: tags:
 ### TF-IDF
 %% Cell type:code id: tags:
 ``` python
 from sklearn.feature_extraction.text import TfidfVectorizer
 ```
 %% Cell type:code id: tags:
 ``` python
 # Three toy documents; TF-IDF weights each term by how specific it is to a
 # document relative to the whole corpus.
 corpus = [
    "The Limmat flows out of the lake.",
    "The bears are in the bear pit near the river.",
    "The Rhône flows out of Lake Geneva.",
    ]
 vectorizer = TfidfVectorizer()
 vectors = vectorizer.fit_transform(corpus)
 # get_feature_names() was removed in scikit-learn 1.2;
 # get_feature_names_out() (available since 1.0) is its replacement.
 feature_names = vectorizer.get_feature_names_out()
 # toarray() gives a regular ndarray; todense() returns the discouraged
 # np.matrix type.
 dense = vectors.toarray().tolist()
 # Transpose so that terms are rows and documents are columns.
 pd.DataFrame(dense, columns=feature_names).transpose()
 ```
 %% Cell type:markdown id: tags:
 ### Word Embeddings
 %% Cell type:code id: tags:
 ``` python
 from gensim.models import KeyedVectors
 # Load the stored word vectors memory-mapped read-only (mmap="r").
 # NOTE(review): this rebinds `vectors`, which previously held the TF-IDF
 # matrix - a distinct name would avoid stale state on out-of-order re-runs.
 vectors = KeyedVectors.load("../../cc.de.300-distilled.vec", mmap="r")
 # Point the normalised-vector attribute at the raw vectors; presumably this
 # avoids computing normalised copies of a read-only mmap - TODO confirm.
 # NOTE(review): syn0 / syn0norm are legacy gensim attribute names - confirm
 # they still exist in the installed gensim version.
 vectors.syn0norm = vectors.syn0
 ```
 %% Cell type:code id: tags:
 ``` python
 # Look up the embedding stored for the word "Mann".
 vectors["Mann"]
 ```
 %% Cell type:code id: tags:
 ``` python
 # Analogy query: top-ranked neighbour of "König" - "Mann" + "Frau".
 # `vectors` already is the KeyedVectors store, so query it directly: the
 # `.wv` alias on KeyedVectors was deprecated in gensim 3 and is gone in
 # gensim 4, where it raises AttributeError.
 most_similar = vectors.most_similar(positive=["Frau", "König"], negative=["Mann"])[0]
 print("König - Mann + Frau = ", most_similar)
 ```
 %% Cell type:markdown id: tags:
 ## Profile Report
 %% Cell type:code id: tags:
 ``` python
 # Importing pandas_profiling registers the DataFrame.profile_report accessor
 # used on the next line.
 import pandas_profiling
 profile = df.profile_report(html={'style':{'full_width':True}})
 # Save report under a descriptive name with an explicit .html extension; the
 # previous target "c" had no extension and looked like a truncated file name.
 profile.to_file(output_file="cars_profile_report.html")
 # Last expression: render the report inline in the notebook.
 profile
 ```