Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
ADML HSLU HS22 update
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Micha Wiss
ADML HSLU HS22 update
Commits
9cff590a
Commit
9cff590a
authored
2 years ago
by
Solange Emmenegger
Browse files
Options
Downloads
Patches
Plain Diff
Auto-saving for solange.emmenegger@hslu.ch on branch master from commit
e869906d
parent
8bf57070
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
notebooks/01A Data Quality Assessment/Data Quality Assessment Examples.ipynb
+15
-3
15 additions, 3 deletions
...Quality Assessment/Data Quality Assessment Examples.ipynb
with
15 additions
and
3 deletions
notebooks/01A Data Quality Assessment/Data Quality Assessment Examples.ipynb
+
15
−
3
View file @
9cff590a
...
...
@@ -9,9 +9,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'wordcloud'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmetrics\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m mean_squared_error\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwordcloud\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WordCloud\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n\u001b[1;32m 9\u001b[0m warnings\u001b[38;5;241m.\u001b[39mfilterwarnings(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'wordcloud'"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
...
...
%% Cell type:markdown id: tags:
# Data Quality Assessment
%% Cell type:code id: tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
seaborn
as
sns
import
matplotlib.pyplot
as
plt
from
sklearn.linear_model
import
LinearRegression
from
sklearn.metrics
import
mean_squared_error
from
wordcloud
import
WordCloud
import
warnings
warnings
.
filterwarnings
(
"
ignore
"
)
%
matplotlib
inline
```
%% Output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Input In [1], in
<cell
line:
7
>
()
5 from sklearn.linear_model import LinearRegression
6 from sklearn.metrics import mean_squared_error
----> 7 from wordcloud import WordCloud
8 import warnings
9 warnings.filterwarnings("ignore")
ModuleNotFoundError: No module named 'wordcloud'
%% Cell type:code id: tags:
```
python
# Read the cars dataset from the notebook's working directory and preview
# the first rows (the trailing expression is the cell's displayed output).
df = pd.read_csv(filepath_or_buffer="cars.csv")
df.head()
```
%% Cell type:markdown id: tags:
## Skewed Data
%% Cell type:code id: tags:
```
python
# Keep the horsepower column in its own variable (later cells reuse it) and
# draw a histogram of its values.
horsepower = df["Horsepower"]
horsepower.plot(kind="hist")
```
%% Cell type:code id: tags:
```
python
# Compare the mean and the mode of the raw horsepower values; the gap
# between the two is what this section uses to illustrate skew.
hp_mean = horsepower.mean()

# Series.mode() returns a Series (several entries when values tie for most
# frequent), so pick the first entry explicitly rather than calling int() on
# the whole Series, which fails on ties and is deprecated in recent pandas.
hp_mode = int(horsepower.mode().iloc[0])

print("Mean:", hp_mean)
print("Mode:", hp_mode)
print("")
print("Mean - Mode =", hp_mean - hp_mode)
```
%% Cell type:markdown id: tags:
### Log-Transform Data
%% Cell type:code id: tags:
```
python
# Log-transform the horsepower values (vectorised, element-wise natural log)
# and plot the histogram of the transformed data.
horsepower_log = np.log(horsepower)
horsepower_log.plot.hist()

log_mean = horsepower_log.mean()

# Series.mode() returns a Series; take its first entry explicitly. The value
# is kept as a float: truncating a log-scale mode to an integer would throw
# away the fractional part and distort the "Mean - Mode" comparison below.
log_mode = horsepower_log.mode().iloc[0]

print("Mean:", log_mean)
print("Mode:", log_mode)
print("\nMean - Mode =", log_mean - log_mode)
```
%% Cell type:markdown id: tags:
## Boxplots
%% Cell type:markdown id: tags:
### Horsepower
%% Cell type:code id: tags:
```
python
# Box plot of horsepower, drawn on an explicitly sized 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
horsepower.plot(kind="box", ax=ax)
```
%% Cell type:code id: tags:
```
python
# Summary statistics of horsepower; the quartiles shown here are the ones
# the next cell uses for the IQR boundaries.
horsepower.describe()
```
%% Cell type:code id: tags:
```
python
# Outlier boundaries for horsepower using the 1.5 * IQR rule.
hp_summary = horsepower.describe()
q3 = hp_summary.loc['75%']
q1 = hp_summary.loc['25%']
iqr = q3 - q1

# Values beyond these boundaries lie more than 1.5 IQRs outside the quartiles.
upper_boundary = q3 + 1.5 * iqr
lower_boundary = q1 - 1.5 * iqr

print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
```
%% Cell type:markdown id: tags:
### Year
%% Cell type:code id: tags:
```
python
# Keep the year column in its own variable (later cells reuse it) and draw
# its box plot on a 12x8 figure.
year = df["Year"]
fig, ax = plt.subplots(figsize=(12, 8))
year.plot(kind="box", ax=ax)
```
%% Cell type:code id: tags:
```
python
# Summary statistics of the year column; the quartiles shown here are the
# ones the next cell uses for the IQR boundaries.
year.describe()
```
%% Cell type:code id: tags:
```
python
# Outlier boundaries for the year column using the 1.5 * IQR rule.
year_summary = year.describe()
q3 = year_summary.loc['75%']
q1 = year_summary.loc['25%']
iqr = q3 - q1

# Values beyond these boundaries lie more than 1.5 IQRs outside the quartiles.
upper_boundary = q3 + 1.5 * iqr
lower_boundary = q1 - 1.5 * iqr

print("Upper boundary:", upper_boundary, "Lower boundary:", lower_boundary)
```
%% Cell type:markdown id: tags:
## Correlation
%% Cell type:code id: tags:
```
python
# Correlation heatmap of the dataset's numeric columns.
plt.subplots(figsize=(10, 8))

# df also holds the non-numeric Color column (used further down for the dummy
# variable example). DataFrame.corr() silently dropped such columns in older
# pandas but raises on them since pandas 2.0, so restrict the frame to
# numeric/bool columns explicitly -- this behaves the same on every version.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True, cmap='RdYlGn_r', linewidths=0.5)
```
%% Cell type:markdown id: tags:
## Dummy Variable Trap
%% Cell type:code id: tags:
```
python
# One-hot encode the Color column with one indicator column per distinct
# value, then preview the result.
colors = pd.get_dummies(df["Color"])
colors.head()
```
%% Cell type:code id: tags:
```
python
# Correlation heatmap of the one-hot encoded colour indicators.
plt.subplots(figsize=(10, 8))
dummy_corr = colors.corr()
sns.heatmap(dummy_corr, annot=True, cmap="RdYlGn_r", linewidths=0.5)
```
%% Cell type:markdown id: tags:
### Avoid Trap
%% Cell type:code id: tags:
```
python
# Encode Color again, this time dropping the first indicator column so the
# remaining columns no longer sum to one (avoids the dummy variable trap).
colors = pd.get_dummies(df["Color"], drop_first=True)
colors.head()
```
%% Cell type:markdown id: tags:
## Numerical Encoding of Text
%% Cell type:markdown id: tags:
### TF-IDF
%% Cell type:code id: tags:
```
python
from
sklearn.feature_extraction.text
import
TfidfVectorizer
```
%% Cell type:code id: tags:
```
python
# Small three-sentence corpus used to demonstrate TF-IDF weighting.
corpus = [
    "The Limmat flows out of the lake.",
    "The bears are in the bear pit near the river.",
    "The Rhône flows out of Lake Geneva.",
]

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old name only
# when running on a scikit-learn release that predates the new method.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert the sparse result to a plain list of rows and show one row per
# term, one column per document.
dense = vectors.todense().tolist()
pd.DataFrame(dense, columns=feature_names).transpose()
```
%% Cell type:markdown id: tags:
### Word Embeddings
%% Cell type:code id: tags:
```
python
from gensim.models import KeyedVectors

# Load the word vectors from disk, memory-mapped read-only.
vectors = KeyedVectors.load("../../cc.de.300-distilled.vec", mmap="r")

# Point the "normalised" attribute at the stored vectors.
# NOTE(review): syn0 / syn0norm look like gensim 3.x attribute names, and this
# presumably relies on the stored vectors already being normalised -- confirm
# against the installed gensim version and the vector file.
vectors.syn0norm = vectors.syn0
```
%% Cell type:code id: tags:
```
python
# Display the embedding vector stored for the word "Mann".
vectors["Mann"]
```
%% Cell type:code id: tags:
```
python
# Word-analogy query: König - Mann + Frau.
# `vectors` is already a KeyedVectors instance, so query it directly. The
# `.wv` indirection is deprecated on KeyedVectors in gensim 3.x and no longer
# exists in gensim 4.0.
most_similar = vectors.most_similar(
    positive=["Frau", "König"],
    negative=["Mann"],
)[0]  # keep only the best match
print("König - Mann + Frau =", most_similar)
```
%% Cell type:markdown id: tags:
## Profile Report
%% Cell type:code id: tags:
```
python
import pandas_profiling

# Build a profiling report for the whole dataframe, rendered full-width.
report_html_options = {'style': {'full_width': True}}
profile = df.profile_report(html=report_html_options)

# Save report
# NOTE(review): the output file name "c" looks accidental -- confirm the
# intended file name.
profile.to_file(output_file="c")

# Trailing expression so the notebook renders the report inline.
profile
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment