Example of preparing image set for training

Imports

from  cellsegment.core import *
from  cellsegment.dataprep_utils import *
from  cellsegment.json_utils import *
from cellsegment.set_directories import *
import pandas
import numpy as np
from fastai.vision import *

Define directories

# Build the project directory layout rooted at the test-data folder and
# print the resolved paths (base, crop, label, train, test, ...).
dirs = Dirs('../testdata_2')
print(dirs)
None
  basepath        :  ../testdata_2        
  crop            :  ../testdata_2/Crop-200 
  cropLabel       :  ../testdata_2/Crop-200/Label 
  cropTest        :  ../testdata_2/Crop-200/Test 
  cropTrain       :  ../testdata_2/Crop-200/Train 
  cropValidTxtFile:  ../testdata_2/Crop-200/valid.txt 
  label           :  ../testdata_2/Fullsize/Label 
  model           :  ../testdata_2/models/ 
  originImages    :  ../testdata_2/Original 
  sizeCsvFile     :  ../testdata_2/file_size.csv 
  test            :  ../testdata_2/Fullsize/Test 
  train           :  ../testdata_2/Fullsize/Train 
  validTxtFile    :  ../testdata_2/Fullsize/valid.txt 

Get stats on images

# Report jpg/json file statistics for the original-images directory.
# The call prints its own report; printing the function object afterwards
# would only emit its repr ("<function show_directory_stats at 0x...>").
show_directory_stats(dirs.originImages)
../testdata_2/Original
Number of jpg files = 30
Number of json files = 30
missing: 0 []
added: 0 []
<function show_directory_stats at 0x7f6ef39800d0>

TODO: Change this. The dataset consists of legacy images and micro-i images.
Structure: split into 70% Train, 15% Val, 15% Test, taken randomly from the 354 images.
- Original (jpg+csv+json) [354 jpg; 354 csv; 354 json; Total = 1062]

- Fullsize    (random split into 70% Train, 15% Val, 15% Test)  
    - Train
        - (354 jpg + 354 json)
    - Label
        - (354 png)
- Crop-200
    - Error
        - nil
    - Train
        - (443 jpg)
    - Labels
        - (526 png)
    - Test
        - (83 jpg)
    - valid.txt 
        - 73 file names
print("Show original image sizes")

# Target height (pixels) used to derive a per-image scale factor.
height = 800
fnames = sorted(get_image_files(dirs.originImages))

# Collect one record per image. Each file is opened in a context manager so
# its handle is released as soon as the dimensions have been read.
list_filedata = []
for fn in fnames:
    with PIL.Image.open(fn) as img:
        img_w, img_h = img.size
    scale = float(height) / img_h
    list_filedata.append(
        {'Name': fn.name, 'Width': img_w, 'Height': img_h, 'Scale': scale}
    )

df = pandas.DataFrame(list_filedata)
# Only the name and dimensions are kept; the 'Scale' column is dropped here.
df = df[['Name', 'Width', 'Height']]
# print(df)

# Write the size table once; an existing CSV file is left untouched.
savefn = Path(dirs.sizeCsvFile)
if not savefn.exists():
    print(f'Saving {savefn}')
    df.to_csv(savefn)
else:
    print(f'Filename {savefn} already exists')
print(df.head())

print(df.tail())
Show original image sizes
Filename ../testdata_2/file_size.csv already exists
              Name  Width  Height
0   236568 - 1.jpg    826     786
1  236568 - 10.jpg    799     782
2  236568 - 11.jpg    820     805
3  236568 - 12.jpg    807     796
4  236568 - 13.jpg    812     810
              Name  Width  Height
25  236569 - 5.jpg    766     740
26  236569 - 6.jpg    762     740
27  236569 - 7.jpg    811     811
28  236569 - 8.jpg    868     795
29  236569 - 9.jpg    755     739

Optional - Convert directory of CSV files to JSON files

# Convert the Techion CSV files found in the originals directory to JSON.
# Source and destination are the same directory, so the JSON files are
# written next to the images.
csv_to_json_dir(dirs.originImages, dirs.originImages, number_files='all')     
Converting an entire directory of Techion CSV files to JSON files
Number of csv & jpg files to convert 0 30
src_path ../testdata_2/Original
dest_path ../testdata_2/Original

Resize jpg & json files

# Resize the original images into dirs.train with height=800; the size CSV
# path is passed as the first argument.
resize_dir(dirs.sizeCsvFile, dirs.originImages, dirs.train, number_files='all', height=800)
Number of image files: 30, Number to resize: all
# Apply the matching resize (height=800) to the JSON annotation files,
# writing them into dirs.train alongside the resized images.
resize_json_dir(dirs.sizeCsvFile, dirs.originImages, dirs.train, number_files='all', height=800)
Number of JSON files: 30, Number to resize: all
::::::::::::::::::::::::::::::
30  json files processed
30

Create label png images

Label centers are taken from the JSON files; the label PNG images are stored in the destination directory.

# Create the label PNG images in dirs.label from the files in dirs.train.
# NOTE(review): the recorded run fails here with
# "NameError: name 'create_labels_dir' is not defined" -- none of the star
# imports at the top of the file provides this name. Confirm the function's
# current name/module before relying on this step.
create_labels_dir(dirs.train, dirs.label, number_files='all')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ddb9ac633057> in <module>
----> 1 create_labels_dir(dirs.train, dirs.label, number_files='all')
      2 

NameError: name 'create_labels_dir' is not defined

Split into Train, Val and Test

# Persist a one-column ('Name') index of the images found in dirs.train.
file_csv = f'{dirs.basepath}/file_data.csv'
fnames = [fn.name for fn in sorted(get_image_files(dirs.train))]
df = pandas.DataFrame(fnames)
df.columns = ['Name']
df.to_csv(file_csv, index=True)

# Shuffle the CSV with a fixed seed, then split the file names
# (num_train=0.5, num_val=0.25).
shuffle_csv(file_csv, random_state=23)
print('\n Split into train valid and test directories')
split_filenames(file_csv, num_train=0.5, num_val=0.25)

# Crop the images listed in the CSV; results go under dirs.crop.
misslist, croplist = crop_img_dir(
    file_csv, dirs.train, dirs.train, dirs.crop, number_files='all', DEBUG=False
)
print(f'Num Missed: {len(misslist)}, Num Cropped: {len(croplist)}')

# Keep one row per crop (name, label, operation) and save it.
crop_df = pandas.DataFrame(croplist).loc[:, ['Name', 'Label', 'Op']]
crop_df.to_csv(f'{dirs.crop}/crop_df.csv')
crop_df.tail()

# The names of the crops tagged 'Valid' form the validation file list.
valid_df = crop_df.loc[crop_df.Op == 'Valid', 'Name']
valid_df.to_csv(f'{dirs.crop}/valid.txt', index=False, header=True)
valid_df.head(10)

# Second cropping pass, this time reading from the label directory.
labmisslist, labcroplist = crop_img_dir(
    file_csv, dirs.train, dirs.label, dirs.crop, number_files='all'
)
print(f'Num Missed: {len(labmisslist)}, Num Cropped: {len(labcroplist)}')

Save test_df.csv

# Reload the crop table and replace the numeric label codes with names.
crop_df = pandas.read_csv(f'{dirs.crop}/crop_df.csv', index_col=0)
label_names = {40: 'Fluke-Rumen', 11: 'Fluke-Liver'}
for code, label in label_names.items():
    crop_df.loc[crop_df.Label == code, 'Label'] = label
crop_df.to_csv(f'{dirs.crop}/crop_df.csv')
crop_df.tail()

# Rows tagged 'Test' are written to their own CSV (without the index column).
test_df = crop_df.loc[crop_df.Op == 'Test']
test_df.to_csv(f'{dirs.crop}/test_df.csv', index=False)
test_df.tail()

Save label files without color palette info

otherwise databunch segmentation does not work

# Re-save every cropped label image from its raw pixel array.
# The source file is opened in a context manager and fully read into a
# uint8 array first, so its handle is closed before the same path is
# overwritten. Rebuilding the image from the array is what drops the palette
# (presumably the labels are palette-mode PNGs -- confirm).
fnames = get_image_files(Path(dirs.crop)/'Label')
for fn in fnames:
    with PIL.Image.open(fn) as src_img:
        img = np.asarray(src_img).astype(np.uint8)
    PIL.Image.fromarray(img).save(fn, quality=90)

Test Create DataBunch

# create data set
from fastai.vision import *
from fastai.utils.mem import *
path_img = dirs.cropTrain
path_lbl = dirs.cropLabel

# Class names handed to the label list as `classes`.
codes = np.array(['background', '1', '2', '3'])


def get_label_fn(x):
    """Return the path of the label PNG in ``path_lbl`` for image path ``x``.

    Defined as a named function rather than a lambda so it has a proper name
    in tracebacks and can be pickled by reference.
    """
    return f'{path_lbl}/{x.stem}.png'


# Items come from the cropped Train folder; '../valid.txt' names the
# validation files (presumably resolved relative to path_img -- confirm).
src = (SegmentationItemList.from_folder(path_img)
       .split_by_fname_file('../valid.txt')
       .label_from_func(get_label_fn, classes=codes, convert_mode='RGB'))

# Flips (including vertical) only: rotation, zoom and warp are disabled.
tfms = get_transforms(flip_vert=True, max_rotate=None, max_zoom=1., max_warp=None)

bs = 8
# tfm_y=True applies the same transforms to the label masks.
data = (src.transform(tfms, tfm_y=True)
        .databunch(bs=bs)
        .normalize(imagenet_stats))

Show example images

# Display a sample of the training set in a 10x10-inch figure.
data.show_batch(4, ds_type=DatasetType.Train, figsize=(10,10))
def acc_metric1(input, target):
    """Return the overall pixel accuracy.

    ``target`` has its size-1 dimension 1 squeezed away, the predicted class
    is the argmax of ``input`` over dimension 1, and the result is the mean
    of the element-wise matches as a float tensor.
    """
    labels = target.squeeze(1)
    predicted = input.argmax(dim=1)
    hits = predicted == labels
    return hits.float().mean()

def acc_metric2(input, target):
    """Return the pixel accuracy restricted to non-background targets.

    Only positions where the squeezed ``target`` is greater than 0 are
    compared against the argmax of ``input`` over dimension 1; the result is
    the mean of those matches as a float tensor.
    """
    labels = target.squeeze(1)
    foreground = labels > 0
    predicted = input.argmax(dim=1)
    return (predicted[foreground] == labels[foreground]).float().mean()
  
# Metrics reported during training: overall pixel accuracy and accuracy
# restricted to non-background (target > 0) pixels.
metrics=[acc_metric1, acc_metric2]

# Weight decay passed to the learner.
wd=1e-2
# U-Net learner built on models.resnet34, converted with to_fp16().
learn = unet_learner(data, models.resnet34, metrics=metrics, wd=wd).to_fp16()

run fastai learn

# Placeholder: the learning-rate finder calls are commented out, so this
# branch does nothing even when CUDA is available.
if torch.cuda.is_available():
    # learn.lr_find()
    # learn.recorder.plot()
    pass

All done!