Source code for slide

# -*- coding: utf-8 -*-

"""
Handle annotated whole-slide images (WSI; also known as virtual slides).

"""

import warnings
from distutils.version import StrictVersion
import random

import numpy as np
import openslide
import cv2

import annotation as reader
import tissue


class _AnnotatedOpenSlide(openslide.OpenSlide):
    """An open *annotated* WSI.

    Wrapper of `openslide.OpenSlide` to handle annotation polygons together
    with their corresponding OpenSlide Python WSI instances.

    Parameters
    ----------
    filename: str
        Path to slide file.
    annotation_filename: str
        Path to XML annotation file.
    xml_style: str
        The annotation XML style. Typically associated with a computational
        histology challenge releasing the dataset.
        * 'asap': CAMELYON grand challenges in pathology.
        * 'bach': BACH Grand Challenge on Breast Cancer Histology Images.

    Attributes
    ----------
    filename: str
        The WSI file name to read.
    annotation_filename: str
        Corresponding XML annotation file.
    polygons: list of lists of 2-tuples
        Polygon region annotations.
    labels: list of str
        Polygon region annotation labels.
    label_map: dict {str: int}
        Correspondence between polygon region annotation labels and integer
        values used in numpy masks.

    """

    def __init__(self, filename, annotation_filename, xml_style):
        openslide.OpenSlide.__init__(self, filename)
        self.filename = filename  # Useful to name predicted annotations
        self.annotation_filename = annotation_filename
        self.xml_style = xml_style
        self.polygons = None
        self.label_map = None

        # Label annotations as:
        #   1: Normal tissue
        #   2: Tumor tissue
        # 0 is automatically assigned to all unannotated pixels
        if self.annotation_filename is not None:
            if self.xml_style == 'asap':
                self.polygons, self.labels = reader.asap_annotations(
                    self.annotation_filename)
                # CAMELYON17 data
                if 'metastases' in self.labels:
                    # Avoid value 0 (used by default for unlabeled regions)
                    self.label_map = {'metastases': 2, 'normal': 1}
                # CAMELYON16 data
                elif '_0' in self.labels or '_1' in self.labels:
                    self.label_map = {'_0': 2, '_1': 2, '_2': 1}
                elif 'Tumor' in self.labels or 'Exclusion' in self.labels:
                    self.label_map = {'Tumor': 2, 'Exclusion': 1}
                else:  # Predicted annotations
                    self.label_map = {'predicted_tumor': 1}
            elif self.xml_style == 'bach':
                # Value 1 is reserved for 'normal' tissue annotations
                self.label_map = {'Benign': 2, 'Carcinoma in situ': 3,
                                  'Invasive carcinoma': 4}
                self.polygons, self.labels = reader.bach_annotations(
                    self.annotation_filename)
            else:
                raise ValueError(
                    '"xml_style" value must be either "asap" or "bach".')
        else:
            if self.xml_style is not None:
                warnings.warn('"xml_style" is only used if an ' +
                              '"annotation_filename" is provided.')


class Slide(_AnnotatedOpenSlide):
    """An open WSI, with or without annotations.

    Added functionality to OpenSlide WSI instances, optionally with
    annotations.

    Parameters
    ----------
    filename: str
        Path to slide file. Namesake argument to `openslide.OpenSlide` and
        `AnnotatedOpenSlide`.
    annotation_filename: str
        Path to XML annotation file. Namesake argument to
        `AnnotatedOpenSlide`.
    xml_style: str
        The computational histology challenge releasing the dataset.
        Namesake argument to `AnnotatedOpenSlide`.

    Attributes
    ----------
    tissue_mask: Numpy 2D array
        Tissue RoI ``annotation``.
    tissue_label_map: dict {str: int}
        Correspondence between polygon region annotation labels and integer
        values used in numpy masks.
    downsampling_factor: float
        The scaling factor relative to level 0 dimensions.
    downsampled_slide: PIL image
        The scaled down slide generated when running the tissue detector.

    Examples
    --------
    >>> from wsipre import slide
    >>> wsi = slide.Slide('tumor_001.tif', 'tumor_001.xml', 'asap')
    >>> wsi.label_map
    {'_0': 2, '_1': 2, '_2': 1}
    >>> wsi.level_count
    10
    >>> len(wsi.polygons)
    2
    >>> thumbnail, mask, dwnspl_factor = wsi.get_thumbnail_with_annotation(
    ...     size=(3000, 3000), polygon_type='line', line_thickness=5)
    >>> slide_region = wsi.read_region_with_annotation(
    ...     location=(65000, 110000), level=4, size=(1000, 1000))

    """

    def __init__(self, filename, annotation_filename=None, xml_style=None):
        self.tissue_mask = None
        self.tissue_label_map = None
        self.downsampling_factor = None
        self.downsampled_slide = None

        # OpenSlide version < 3.4.1 lacks support for some recent file formats
        version = openslide.__library_version__
        if StrictVersion(version) < StrictVersion('3.4.1'):
            warnings.warn(
                f'OpenSlide version {version} lacks support for some ' +
                'whole-slide image file formats. It is highly recommended ' +
                'to update to a more recent version.')

        if annotation_filename is None:
            if xml_style is not None:
                warnings.warn(
                    '"xml_style" is not used (no annotation was provided).')
            openslide.OpenSlide.__init__(self, filename)
        else:
            _AnnotatedOpenSlide.__init__(
                self, filename, annotation_filename, xml_style)

    def _draw_polygons(self, mask, polygons, polygon_type,
                       line_thickness=None):
        """Convert polygon vertex coordinates to polygon drawing."""
        for poly, label in zip(polygons, self.labels):
            if polygon_type == 'line':
                mask = cv2.polylines(mask, [poly], True,
                                     int(self.label_map[label]),
                                     line_thickness)
            elif polygon_type == 'area':
                if line_thickness is not None:
                    warnings.warn('"line_thickness" is only used if ' +
                                  '"polygon_type" is "line".')
                mask = cv2.fillPoly(mask, [poly], int(self.label_map[label]))
            else:
                raise ValueError(
                    'Accepted "polygon_type" values are "line" or "area".')

        return mask

    def _draw_tissue_polygons(self, mask, polygons, polygon_type,
                              line_thickness=None):
        """Convert tissue polygon vertex coordinates to polygon drawing."""
        tissue_label = 1

        for poly in polygons:
            if polygon_type == 'line':
                mask = cv2.polylines(
                    mask, [poly], True, tissue_label, line_thickness)
            elif polygon_type == 'area':
                if line_thickness is not None:
                    warnings.warn('"line_thickness" is only used if ' +
                                  '"polygon_type" is "line".')
                mask = cv2.fillPoly(mask, [poly], tissue_label)
            else:
                raise ValueError(
                    'Accepted "polygon_type" values are "line" or "area".')

        return mask
    def get_thumbnail_with_annotation(self, size, polygon_type='area',
                                      line_thickness=None):
        """Convert *annotated* WSI to a scaled-down thumbnail.

        Parameters
        ----------
        size: 2-tuple
            Maximum size of compressed image thumbnail as (width, height).
        polygon_type: str
            Type of polygon drawing:
                * 'line'
                * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Scaled down slide (as PIL RGBA image) and annotation mask (as numpy
        2D array), plus the scaling factor.

        """
        if self.polygons is None:
            raise ValueError(
                'No annotation is available. Please load an annotation ' +
                'or consider using "get_thumbnail" instead.')

        thumb = self.get_thumbnail(size)
        width_factor = self.dimensions[0] / size[0]
        height_factor = self.dimensions[1] / size[1]
        downsampling_factor = max(width_factor, height_factor)

        polygons = np.array([np.array(poly) for poly in self.polygons])
        polygons = polygons / downsampling_factor
        polygons = [poly.astype('int32') for poly in polygons]

        mask = np.zeros(thumb.size[::-1])  # Reverse width and height
        mask = self._draw_polygons(
            mask, polygons, polygon_type, line_thickness)

        return thumb, mask, downsampling_factor
    def read_region_with_annotation(self, location, level, size,
                                    polygon_type='area',
                                    line_thickness=None):
        """Crop a smaller region from an *annotated* WSI.

        Get a defined region from a WSI, together with its annotation mask.

        Parameters
        ----------
        location: 2-tuple
            X and y coordinates of the top left pixel. Namesake argument to
            OpenSlide's "read_region" function.
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        polygon_type: str
            Type of polygon drawing:
                * 'line'
                * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if 'polygon_type' is
            'line'.

        Returns
        -------
        Image region (as PIL RGBA image) and the annotation mask (as numpy
        2D array).

        """
        if self.polygons is None:
            raise ValueError(
                'No annotation is available. Please load an annotation ' +
                'or consider using "read_region" instead.')

        downsampling_factor = self.level_downsamples[level]
        slide_region = self.read_region(location, level, size)

        polygons = np.array([np.array(poly) for poly in self.polygons])
        polygons = polygons / downsampling_factor
        polygons = [poly.astype('int32') for poly in polygons]

        location = tuple(int(coord / downsampling_factor)
                         for coord in location)

        # Convert polygon coordinates to patch coordinates
        polygons = [poly - np.array(location) for poly in polygons]

        mask_region = np.zeros(size[::-1])
        mask_region = self._draw_polygons(
            mask_region, polygons, polygon_type, line_thickness)

        return slide_region, mask_region
    def get_tissue_mask(self, downsampling_factor=64, polygon_type='area',
                        line_thickness=None):
        """Generate tissue region annotation.

        Make a binary mask annotating tissue regions of interest (RoI) on the
        WSI using an automatic threshold-based segmentation method inspired by
        the one used by Wang *et al.* [1]_. Briefly, the method consists of
        the following steps, starting from a WSI:

            * Select downsampling level (typically a factor of 64)
            * Transfer from the RGB to the HSV color space
            * Determine optimal threshold value in the saturation channel
              using the Otsu algorithm [2]_
            * Threshold image to generate a binary mask
            * Fill in small holes and remove small objects

        .. [1] Dayong Wang, Aditya Khosla, Rishab Gargeya, Humayun Irshad,
           Andrew H. Beck, "Deep Learning for Identifying Metastatic Breast
           Cancer", arXiv:1606.05718.

        .. [2] Nobuyuki Otsu, "A Threshold Selection Method from Gray-level
           Histograms", IEEE Trans Syst Man Cybern., 9(1):62–66, 1979.

        Parameters
        ----------
        downsampling_factor: int
            The desired factor to downsample the image by, since full WSIs
            will not fit in memory. The image's closest level downsample is
            found and used.
        polygon_type: str
            Type of polygon drawing:
                * 'line'
                * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Binary mask as numpy 2D array, RGB slide image (in the used
        downsampling level, to allow visualization) and downsampling factor.

        """
        (mask_contours, self.downsampled_slide,
         self.downsampling_factor) = tissue.detect_tissue(
             self, downsampling_factor)

        mask = np.zeros(self.downsampled_slide.shape[:2])
        self.tissue_mask = self._draw_tissue_polygons(
            mask, mask_contours, polygon_type, line_thickness)
        self.tissue_label_map = {'background': 0, 'tissue': 1}

        return self
    def _get_random_coordinates(self, pixels_in_roi, level,
                                mask_downsampling_factor, size):
        """Get region (patch) coordinates at random."""
        coordinates = random.choice(pixels_in_roi)

        # Scale coordinates up to level-0 dimensions
        coordinates = [int(x * mask_downsampling_factor)
                       for x in coordinates]

        # OpenSlide Python's read_region takes top left corner position,
        # which effectively excludes tissue above and to the left of
        # annotations
        # Fix it by offsetting coordinates: convert to center pixel position
        # (subtract half of patch size from x and y coordinates)
        half_size = [x * self.level_downsamples[level] / 2 for x in size]
        row_location = coordinates[0] - half_size[0]
        col_location = coordinates[1] - half_size[1]

        # Make sure coordinates are still within slide margins
        cols_edge = self.dimensions[0] - size[0]
        rows_edge = self.dimensions[1] - size[1]
        row_location = max(0, min(row_location, rows_edge))
        col_location = max(0, min(col_location, cols_edge))

        return int(col_location), int(row_location)  # OpenSlide: width, height

    def _get_pixels_avoiding_labels(self, avoid_labels):
        # Can only do this if an annotation is available
        if self.polygons is None:
            raise ValueError('No annotation is available.')

        # Generate annotated thumbnail with exact dimensions of tissue mask
        rows, cols = self.tissue_mask.shape
        _, mask, _ = self.get_thumbnail_with_annotation(
            size=(cols, rows), polygon_type='area')

        # Locate pixels in tissue and not in "avoid_labels"
        new_mask = np.where(self.tissue_mask == 1, 1, mask)
        new_mask = np.where(np.isin(mask, avoid_labels), 2, new_mask)

        return tuple(zip(*np.where(new_mask == 1)))

    def _max_repeated_pixel_ratio(self, image):
        """Compute ratio of count of most common pixel value to total count."""
        image = np.array(image)
        _, counts = np.unique(image, return_counts=True)

        return np.max(counts) / image.size

    def _get_area_ratio(self, mask, target_class):
        """Compute ratio of pixels labeled as 'target_class' to all pixels."""
        class_pixels = mask[np.where(mask == target_class)].size

        return class_pixels / mask.size
    def read_random_tissue_patch(self, level, size, avoid_labels=None):
        """Crop random patch from detected tissue on WSI.

        Parameters
        ----------
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        avoid_labels: iterable
            Labels from provided annotation to avoid when targeting tissue.
            An annotation must be available.

        Returns
        -------
        Image region or patch as PIL RGBA image (plus annotation mask as 2D
        Numpy array if labels to avoid are provided).

        """
        if self.tissue_mask is None:
            self.get_tissue_mask()

        if avoid_labels is None:
            # Select coordinates matching tissue (labeled as 1)
            pixels_in_roi = tuple(zip(*np.where(self.tissue_mask == 1)))
        else:
            # Select coordinates matching tissue (labeled as 1) and avoiding
            # indicated labels
            # Can only do this if an annotation is available
            if self.polygons is None:
                raise ValueError('No annotation is available.')

            # Generate annotated thumbnail with exact dimensions of tissue
            # mask
            rows, cols = self.tissue_mask.shape
            _, mask, _ = self.get_thumbnail_with_annotation(
                size=(cols, rows), polygon_type='area')

            # Locate pixels in tissue and not in "avoid_labels"
            new_mask = np.where(self.tissue_mask == 1, 1, mask)
            new_mask = np.where(np.isin(mask, avoid_labels), 2, new_mask)

            pixels_in_roi = tuple(zip(*np.where(new_mask == 1)))

        coordinates = self._get_random_coordinates(
            pixels_in_roi, level, self.downsampling_factor, size)
        patch = self.read_region(coordinates, level, size)

        # Avoid false positive regions where a large proportion of pixels is
        # exactly the same
        while self._max_repeated_pixel_ratio(patch) > 0.5:
            coordinates = self._get_random_coordinates(
                pixels_in_roi, level, self.downsampling_factor, size)
            patch = self.read_region(coordinates, level, size)

        # Also, make sure the patch is mostly normal tissue
        # (or background, not actually excluding it here...)
        if avoid_labels is not None:
            patch, mask = self.read_region_with_annotation(
                coordinates, level, size, polygon_type='area')
            avoid_ratios = [self._get_area_ratio(mask, x)
                            for x in avoid_labels]

            i = 0
            while any(x > 0.1 for x in avoid_ratios):
                i += 1
                coordinates = self._get_random_coordinates(
                    pixels_in_roi, level, self.downsampling_factor, size)
                patch, mask = self.read_region_with_annotation(
                    coordinates, level, size, polygon_type='area')
                avoid_ratios = [self._get_area_ratio(mask, x)
                                for x in avoid_labels]

                if i > 15:
                    raise ValueError(
                        'Cannot seem to find patch matching requirements.')

            return patch, mask

        return patch
    def _pick_random_coordinates(self, level, size, target_class):
        """Pick random patch to crop from WSI."""
        # Get thumbnail and select coordinates at random
        _, mask, downsampling_factor = self.get_thumbnail_with_annotation(
            size=(5000, 5000), polygon_type='area')

        # Select coordinates matching target class
        pixels_in_roi = tuple(zip(*np.where(mask == target_class)))
        coordinates = self._get_random_coordinates(
            pixels_in_roi, level, downsampling_factor, size)

        # Get ratio (polygon type must be 'area')
        _, mask = self.read_region_with_annotation(
            coordinates, level, size, polygon_type='area')
        area_ratio = self._get_area_ratio(mask, target_class)

        return coordinates, area_ratio
    def read_random_patch(self, level, size, target_class,
                          min_class_area_ratio, polygon_type='area',
                          line_thickness=None):
        """Crop random patch from WSI.

        Select random location within target class according to provided
        annotation.

        Parameters
        ----------
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        target_class: int
            The class annotation of the central pixel of the patch.
        min_class_area_ratio: float [0, 1]
            Minimum ratio of target class pixels to total pixels.
        polygon_type: str
            Type of polygon drawing:
                * 'line'
                * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Image region or patch as PIL RGBA image plus annotation mask as 2D
        Numpy array.

        """
        if not 0 <= min_class_area_ratio <= 1:
            raise ValueError(
                '"min_class_area_ratio" must be in the interval [0, 1].')

        coordinates, area_ratio = self._pick_random_coordinates(
            level, size, target_class)

        # When the slide level is high, the following while loop may be
        # infinite, since the tumor area will never be high enough.
        # Try to sample a few times and throw an error if not successful
        i = 0
        while area_ratio < min_class_area_ratio:
            i += 1
            coordinates, area_ratio = self._pick_random_coordinates(
                level, size, target_class)

            if i > 15:
                raise ValueError(
                    'Cannot seem to find patch with a minimum of ' +
                    f'{min_class_area_ratio * 100}% of class {target_class}.' +
                    ' The chosen slide level may be too high.')

        # Get actual data (chosen polygon type)
        patch, mask = self.read_region_with_annotation(
            coordinates, level, size, polygon_type, line_thickness)

        return patch, mask
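

# A minimal sketch of the threshold-based tissue detection pipeline described
# in Slide.get_tissue_mask (RGB -> HSV, Otsu threshold on the saturation
# channel, morphological clean-up). Illustrative only: this is NOT the actual
# tissue.detect_tissue implementation, and the function name, kernel size and
# return value are assumptions. The real tissue.detect_tissue additionally
# returns the downsampled slide and the effective downsampling factor, which
# get_tissue_mask stores on the instance.
def _tissue_detection_sketch(rgb_image, kernel_size=7):
    """Illustrative only: Otsu-threshold tissue segmentation on an RGB array.

    Parameters
    ----------
    rgb_image: numpy 3D array (height, width, 3), dtype uint8
        A downsampled RGB view of the slide (e.g. from a slide thumbnail).

    Returns
    -------
    Binary mask as numpy 2D array (1: tissue, 0: background).

    """
    hsv = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2HSV)
    saturation = hsv[:, :, 1]

    # Otsu's algorithm selects the threshold automatically
    _, mask = cv2.threshold(
        saturation, 0, 1, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Fill in small holes (closing) and remove small objects (opening)
    kernel = np.ones((kernel_size, kernel_size), dtype=np.uint8)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)

    return mask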
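

# Hedged usage sketch of the patch-sampling methods above. The file names
# follow the docstring example ('tumor_001.tif'/'tumor_001.xml') and are not
# shipped with the package; level, size and ratio values are arbitrary.
if __name__ == '__main__':
    wsi = Slide('tumor_001.tif', 'tumor_001.xml', 'asap')

    # Random patch whose central pixel is tumor (class 2) and whose area is
    # at least 80% tumor, read at slide level 2
    patch, mask = wsi.read_random_patch(
        level=2, size=(256, 256), target_class=2, min_class_area_ratio=0.8)

    # Random patch from detected tissue, avoiding tumor-labeled regions
    tissue_patch, tissue_mask = wsi.read_random_tissue_patch(
        level=2, size=(256, 256), avoid_labels=[2])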