Update 01_Setup.ipynb

b2aca0fc · pawel rosikiewicz · 1fbb8488 · b2aca0fc
Commit b2aca0fc authored 3 years ago by pawel rosikiewicz
--- a/notebooks/01_Setup.ipynb
+++ b/notebooks/01_Setup.ipynb
@@ -4,14 +4,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# SkinAnaliticAI, Skin Cancer Detection with AI Deep Learning\n",
+    "# SkinDiagnosticAI\n",
+    "## __FEASIBILITY STUDY__\n",
+    "## on Skin Cancer Detection with AI Deep Learning using Harvard Dataset and PYClass\n",
    "\n",
-    "## __Evaluation of Harvard Dataset with different AI classiffication techniques using FastClassAI papeline__\n",
    "Author: __Pawel Rosikiewicz__   \n",
    "prosikiewicz@gmail.com      \n",
-    "License: __MIT__    \n",
-    "ttps://opensource.org/licenses/MIT        \n",
-    "Copyright (C) 2021.01.30 Pawel Rosikiewicz                        "
+    "License: __GPL__    \n",
+    "https://opensource.org/licenses/gpl-license     \n",
+    "Copyright (C) 2021.04.26 Pawel Rosikiewicz                        "
   ]
  },
  {
@@ -470,7 +471,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.6.6"
  }
 },
 "nbformat": 4,

 %% Cell type:markdown id: tags:

-# SkinAnaliticAI, Skin Cancer Detection with AI Deep Learning
+# SkinDiagnosticAI
+## __FEASIBILITY STUDY__
+## on Skin Cancer Detection with AI Deep Learning using Harvard Dataset and PYClass

-## __Evaluation of Harvard Dataset with different AI classiffication techniques using FastClassAI papeline__
 Author: __Pawel Rosikiewicz__
 prosikiewicz@gmail.com
-License: __MIT__
-ttps://opensource.org/licenses/MIT
-Copyright (C) 2021.01.30 Pawel Rosikiewicz
+License: __GPL__
+https://opensource.org/licenses/gpl-license
+Copyright (C) 2021.04.26 Pawel Rosikiewicz

 %% Cell type:markdown id: tags:

 # PART 1. Setting Up Project Environment
 ---
 * __Alternative 1.__ CLONE SkinAnaliticAI projects or FastClassAI project from github
 https://github.com/PawelRosikiewicz/SkinAnaliticAI

 * __Alternative 2.__ Create the file structure and copy/paste the content of the src and notebooks folders into the corresponding directories,
    * For that solution, you need to create the file structure for storing scripts, notebooks, input data, etc. for the FastClassAI pipeline; you may modify basedir manually in each notebook, if necessary.
    * follow the instruction below

 ## Step 1. Create basedir directory for your project, eg myproject/
 * then navigate to that directory, and follow the instructions below,

 %% Cell type:markdown id: tags:

 ## Step 2. Setup FastClassAI directory structure in basedir

 %% Cell type:code id: tags:

 ``` python
 # imports,
 import os # allow changing, and navigating files and folders,
 import sys
 import re # module to use regular expressions,
 import glob # lists names in folders that match Unix shell patterns

 # basedir
 # The project root is taken to be the parent of the current working directory
 # (the notebook is expected to be run from <basedir>/notebooks).
 # NOTE: this cell changes the working directory, so re-running it without
 # restarting the kernel moves basedir one more level up each time.
 basedir = os.path.dirname(os.getcwd())
 os.chdir(basedir)
 if basedir not in sys.path:
     # avoid stacking duplicate entries when the cell is executed more than once
     sys.path.append(basedir)
 print(basedir) # should be ../myproject/

 # create folders holding different types of data for notebooks,
 # key: short description of the folder purpose (used only in messages), value: folder path
 files_to_create = {
          "for whatever I dont use but wish to keep": os.path.join(basedir, "bin"),
          "for random notes and materials created on project development": os.path.join(basedir, "notes"),
          # ....
          "for jupyter notebooks": os.path.join(basedir, "notebooks"),
          "for tfhub model": os.path.join(basedir, "models"),
          # ...
          "for tools in .py format": os.path.join(basedir, "src"),
          "IMPORTANT : HERE YOU MUST COPY ALL .py FILES with my functions": os.path.join(basedir, "src", "utils"),
          "for config files": os.path.join(basedir, "src", "configs"),
          # ....
          "to store data and resuls": os.path.join(basedir, "data"),
          "here you will donwload raw images and other files form the source": os.path.join(basedir, "data", "raw"),
          "to store matrices with extracted features": os.path.join(basedir, "data", "interim"),
          "for final results": os.path.join(basedir, "data", "results")
      }
 # ....
 for file_function, folder_path in files_to_create.items():
     if os.path.isdir(folder_path):
         print("file", file_function, " - - - was already created")
         continue
     try:
         # makedirs also creates missing parents, so the result does not
         # depend on the order of entries in files_to_create
         os.makedirs(folder_path, exist_ok=True)
     except OSError as error:
         # report real failures (permissions, a regular file in the way, ...)
         # instead of claiming that the folder already exists
         print("file", file_function, " - - - could not be created:", error)
 ```

 %% Output

    /work/amld-2021-workshop
    file for whatever I dont use but wish to keep  - - - was already created
    file for random notes and materials created on project development  - - - was already created
    file for jupyter notebooks  - - - was already created
    file for tfhub model  - - - was already created
    file for tools in .py format  - - - was already created
    file IMPORTANT : HERE YOU MUST COPY ALL .py FILES with my functions  - - - was already created
    file for config files  - - - was already created
    file to store data and resuls  - - - was already created
    file here you will donwload raw images and other files form the source  - - - was already created
    file to store matrices with extracted features  - - - was already created
    file for final results  - - - was already created

 %% Cell type:markdown id: tags:

 ## Step 3. Copy/paste src
 * copy/paste the current notebook into basedir/notebooks;
 * copy/paste config files to basedir/src/configs
 * copy/paste .py tools to basedir/src/utils (the import test in Step 4 loads them from src.utils)
 * if available, copy/paste results into basedir/data/results
 * if available, copy/paste downloaded tf hub models into basedir/models (each model is one folder)

 %% Cell type:markdown id: tags:

 ## Step 4. Test whether you can import one of my functions

 %% Cell type:code id: tags:

 ``` python
 # to test it, just type:
 from src.utils.feature_extraction_tools import encode_images
 ```

 %% Cell type:markdown id: tags:

 ---
 # Part 2. DOWNLOAD THE DATA AND TF-HUB MODELS FOR TRANSFER LEARNING
 ---

 %% Cell type:markdown id: tags:

 ## Step 1. __Download Input data__

 %% Cell type:markdown id: tags:

 ### __Dataset Description__

 * __Dataset Name__
    * The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions
 * __Source__
    * https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T
 * __Size__
    * Approx. 3 GB
 * __Dataset Version__
    * The HAM10000 dataset had only one version at the time of this project's development; it was published in 2018.
 * __Related Publications__
    * Tschandl, P., Rosendahl, C. & Kittler, H. The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions. Sci. Data 5, 180161 (2018). doi: 10.1038/sdata.2018.161 https://www.nature.com/articles/sdata2018161
 * __License__
    * Non-Commercial purposes only,
    * for more details;
        https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T


 ### __Download Instructions__
    * data (images and metadata) can be found at the bottom of the source site, in the download section
    * Harvard's site contains 6 files that can be downloaded. The following three are required for this project:
        * HAM10000_images_part_1.zip
        * HAM10000_images_part_2.zip
        * HAM10000_metadata.tab
    * Unpack the files, and store all in basedir/data/raw

 %% Cell type:markdown id: tags:

 ## Step 2. __Download Tf-hub Models used for feature extraction__

 ### NOTES
 * In order to work more reliably, I downloaded several pretrained models for feature extraction from images, from tf-hub
 * My function, in section __Data Preparation__, can also use urls; however, this may be problematic with a slow internet connection or with repeated feature extractions performed on different data subsets (timeouts occur frequently in these cases)
 * Important: the function that I implemented in section __Data Preparation__ for feature extraction accepts models constructed with TF1 and TF2.

 ### __Module Description__

 * __Module name used in the project__
    * BiT_M   # working name resnet,
 * __Full Module Name__
    * bit_m-r101x1_1
 * __url__
    * https://tfhub.dev/google/bit/m-r101x1/1
 * __Info__
    * __Input Image size__
        * (?, 224, 224, 3)
    * __Output Feature Number__
        * (?, 2048)
    * __Short Description__
        * Big Transfer (BiT) is a recipe for pre-training image classification models on large supervised datasets and efficiently fine-tuning them on any given target task. The recipe achieves excellent performance on a wide variety of tasks, even when using very few labeled examples from the target dataset.
        * This module implements the R101x1 architecture (ResNet-101), trained to perform multi-label classification on ImageNet-21k, a dataset with 14 million images labeled with 21,843 classes. Its outputs are the 2048-dimensional feature vectors, before the multi-label classification head. This model can be used as a feature extractor or for fine-tuning on a new target task.
        * Instructions:
            module = hub.KerasLayer("https://tfhub.dev/google/bit/m-r101x1/1")
            images = ...  # A batch of images with shape [batch_size, height, width, 3].
            features = module(images)  # Features with shape [batch_size, 2048].

 %% Cell type:markdown id: tags:

 ---
 # PART 3. Prepare Config Files - examples below
 ---
 The goal of this part is to define dataset names, dataset variant names, which tf hub models you use, the colors you assign to each class in the project, etc.

 ## CONFIG FILES
 * location: __basedir/src/configs/project_configs__
 * there are 4 basic configs files that must be prepared
    * __tfhub_configs.py__
        * file that contains info on tf hub modules used for feature extraction
    * __project_configs.py__
        * basic description of the dataset
    * __dataset_configs.py__
        * contains dictionaries used to label images in each class, provide colors etc...
        * and select classes for statistics
    * __config_functions.py__
        * .py file with special functions used to select files for data processing and module training,
 * additionally there is a config file that contains model parameters used when training various ai models
    * this will be described later on,

 ## Notes
 * config files with CLASS_COLORS, and CLASS_DESCRIPTION, were prepared based on,
    * Links from: https://dermoscopedia.org
    * Tschandl, P., Rosendahl, C. & Kittler, H. The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions. Sci. Data 5, 180161 (2018). doi: 10.1038/sdata.2018.161 https://www.nature.com/articles/sdata2018161


 %% Cell type:markdown id: tags:

 ### Step 1. prepare tfhub_configs.py

 * this config file contains one dictionary, TFHUB_MODELS
 * it is used for extracting features from images using downloaded tf hub modules,
 * each module has a unique name and a working name that may be more descriptive and used on plots,
 * the modules can be downloaded from tf-hub and stored in basedir/models, or you may add "module_url" to each dictionary, which is also accepted by the FastClassAI function,

 %% Cell type:code id: tags:

 ``` python
 # config, ...........................................................................................
 # Purpose: describe the tf-hub module(s) used for feature extraction,
 # Localization: tfhub_configs.py
 # Each entry of TFHUB_MODELS is a dictionary with the following values:
 #         "module_name"    : str, name used on plots, and for file saving
 #         "working_name"   : str, alternative to module_name (eg shorter), not used in my projects
 #         "file_name"      : str, name of the file downloaded from tfhub with a given module (can be custom)
 #         "module_url"     : str, url of the module on tfhub
 #         "input_size"     : tuple, (height, width) in pixels
 #         "output_size"    : int, length of the vector with extracted features
 #         "note"           : str, free-text notes that may be important for other users
 #
 #  IMPORTANT: THE KEY OF EACH ENTRY MUST BE THE SAME AS ITS "module_name"

 # example entry for the BiT_M_Resnet101 module
 TFHUB_MODELS = dict(
     BiT_M_Resnet101=dict(
         module_name="BiT_M_Resnet101",
         working_name="resnet",
         file_name="bit_m-r101x1_1",
         module_url="https://tfhub.dev/google/bit/m-r101x1/1",
         input_size=(224, 224),
         output_size=2048,
         note="tested on swissroads dataset, where it worked very well",
     ),
 )  # end
 ```

 %% Cell type:markdown id: tags:

 ### create project_configs.py
 * two variables are the most important:
    * PROJECT_NAME : just a string with a stable project name that will be used throughout the project
    * CLASS_DESCRIPTION : contains a description of each class in the original data, plus extra information such as links to external data sources and a class_description (created manually) that may be very useful later on in the project, while evaluating the results or in EDA

 %% Cell type:code id: tags:

 ``` python
 # config, ...........................................................................................
 # name of the project
 PROJECT_NAME = "SkinAnaliticAI_Harvard_dataset_evaluation"


 # config, ...........................................................................................
 # CLASS_DESCRIPTION
 # Purpose: information on each class, used for creating new class arrangements and for providing info on each class,
 # Localization: project_configs.py
 #
 #  "key"                   :  str, class name used in the original dataset downloaded from the database
 #      "original_name"     :  str, same as the key, but you can introduce other values in case it is necessary
 #      "class_full_name"   :  str, class name used on images, saved data etc, (more descriptive than the class names, or sometimes the same, depending on the situation)
 #      "class_group"       :  str, group of classes, if the classes are hierarchical,
 #      "class_description" :  str, used as notes, or for class description available for the user/client
 #      "links"             :  list, with links to more data on each class

 CLASS_DESCRIPTION = dict(
     akiec=dict(
         original_name='akiec',
         # previously called "Actinic_keratoses" in my dataset, but this name is easier
         # to find in online resources, both names are correct,
         class_full_name="squamous_cell_carcinoma",
         class_group="Tumour_Benign",
         class_description="Class that contains two subclasses:(A) Actinic_Keratoses or (B) Bowen’s disease. Actinic Keratoses (Solar Keratoses) and Intraepithelial Carcinoma (Bowen’s disease) are common non-invasive, variants of squamous cell carcinoma that can be treated locally without surgery. These lesions may progress to invasive squamous cell carcinoma – which is usually not pigmented. Both neoplasms commonly show surface scaling and commonly are devoid of pigment, Actinic keratoses are more common on the face and Bowen’s disease is more common on other body sites. Because both types are induced by UV-light the surrounding skin is usually typified by severe sun damaged except in cases of Bowen’s disease that are caused by human papilloma virus infection and not by UV. Pigmented variants exist for Bowen’s disease and for actinic keratoses",
         links=["https://dermoscopedia.org/Actinic_keratosis_/_Bowen%27s_disease_/_keratoacanthoma_/_squamous_cell_carcinoma"],
     ),
     # 6 more classes follow ...
 )
 ```

 %% Cell type:markdown id: tags:

 ### create dataset_configs.py
 * this is the config file with the largest number of variables,
 * it contains information on
    * DROPOUT_VALUE : a keyword/value that can be introduced into batch labels and will be recognised by the FastClassAI function so that images labelled this way are not used for model training, eg to undersample one or more classes, or to exclude images from some classes in model training,

    * CLASS_COLORS
        * a dictionary with colors assigned to original class labels,
        * key: original class label, value: color (any name accepted by Matplotlib)


    * CLASS_COLORS_zorder
        * because some classes can be merged to build larger classes in different dataset variants,
        I created that variable to assign proper colors to a class that emerges from joining two or more of these classes,
        * eg if we join class 1: yellow (zorder=1), and class 2: blue (zorder=100), the new class will have blue color,

   * CLASS_LABELS_CONFIGS


 %% Cell type:code id: tags:

 ``` python
 # configs .......................................................
 # keyword/value placed in batch labels to mark images that should not be used for model training
 DROPOUT_VALUE = "to_dropout"


 # configs .......................................................
 # colors assigned to original class labels,
 # key: original class label, value: color name
 # NOTE(review): 'Vasc' is capitalised unlike the other keys - confirm that this
 # matches the label spelling used in the dataset metadata
 CLASS_COLORS = dict(
     bkl="orange",
     nv="forestgreen",
     df="purple",
     mel="black",
     Vasc="red",
     bcc="dimgrey",
     akiec="steelblue",
 )


 # configs .......................................................
 # zorder value assigned to each original class label,
 # key: original class label, value: int
 # presumably consulted when classes are merged, to decide which original
 # color the merged class receives - verify against the plotting code
 CLASS_COLORS_zorder = dict(
     bkl=300,
     nv=500,
     df=1,
     mel=200,
     Vasc=1,
     bcc=1,
     akiec=1,
 )


 # configs .......................................................
 # DATASET_CONFIGS
 #   key          : str, dataset name
 #       "info"   : str, notes for the user
 #       "labels" : list of str, names of the class labelling systems listed for that dataset
 #                  (presumably each must be a key of CLASS_LABELS_CONFIGS - verify)
 DATASET_CONFIGS = dict(
     HAM10000=dict(
         info="raw data grouped with original classes, no augmentation, duplicates were removed",
         labels=[
             "Cancer_Detection_And_Classification",
             "Cancer_Risk_Groups",
             "Melanoma_Detection",
             "Skin_Cancer_Detection",
             "Cancer_Classification",
         ],
     ),
 )

 # configs .......................................................
 # CLASS_LABELS_CONFIGS
 #   key                          : str, name of the classification system used
 #       "info"                   : str, notes for the user
 #       "class_labels_dict"      : dict, key: original class label, value: label used in that classification system
 #       "melanoma_stat_labels_dict" : dict, custom dict, added to allow calculating accuracy statistics,
 #                                  with one class containing melanoma (POSITIVE),
 #                                  vs all other classes designated as NEGATIVE
 CLASS_LABELS_CONFIGS = dict(

     Cancer_Detection_And_Classification=dict(
         info="more informative class names for raw data",
         class_labels_dict=dict(
             akiec="Squamous_cell_carcinoma",
             bcc="Basal_cell_carcinoma",
             bkl="Benign_keratosis",
             df="Dermatofibroma",
             nv="Melanocytic_nevus",
             mel="Melanoma",
             Vasc="Vascular_skin_lesions",
         ),
         melanoma_stat_labels_dict=dict(
             Squamous_cell_carcinoma="NEGATIVE",
             Basal_cell_carcinoma="NEGATIVE",
             Benign_keratosis="NEGATIVE",
             Dermatofibroma="NEGATIVE",
             Melanocytic_nevus="NEGATIVE",
             Vascular_skin_lesions="NEGATIVE",
             Melanoma="POSITIVE",
         ),
     ),

     Cancer_Risk_Groups=dict(
         info="""
               7 original classes were grouped into three oncological risk groups
               with vasc&nv assigned into low lever skin lessions, all other cancer types into cancer benign,
               and melanoma as separate category
              """,
         class_labels_dict=dict(
             akiec="Medium-benign_cancer",
             bcc="Medium-benign_cancer",
             bkl="Medium-benign_cancer",
             df="Medium-benign_cancer",
             nv="Low-skin_lession",
             mel="High-melanoma",
             Vasc="Low-skin_lession",
         ),
         # keys contain hyphens, so a literal is used instead of keyword arguments
         melanoma_stat_labels_dict={
             "Low-skin_lession": "NEGATIVE",
             "Medium-benign_cancer": "NEGATIVE",
             "High-melanoma": "POSITIVE",
         },
     ),
 )
 ```