Source code for slide

# -*- coding: utf-8 -*-

"""
Handle annotated whole-slide images (WSI; also known as virtual slides).

"""

import warnings
from distutils.version import StrictVersion
import random

import numpy as np
import openslide
import cv2

import annotation as reader
import tissue


class _AnnotatedOpenSlide(openslide.OpenSlide):
    """An open *annotated* WSI.

    Wrapper of `openslide.OpenSlide` to handle annotation polygons together
    with their corresponding OpenSlide Python WSI instances.

    Parameters
    ----------
    filename: str
        Path to slide file.
    annotation_filename: str
        Path to XML annotation file.
    xml_style: str
        The annotation XML style. Typically associated with a computational
        histology challenge releasing the dataset.
        * 'asap': CAMELYON grand challenges in pathology.
        * 'bach': BACH Grand Challenge on Breast Cancer Histology Images.

    Attributes
    ----------
    filename: str
        The WSI file name to read.
    annotation_filename: str
        Corresponding XML annotation file.
    polygons: list of lists of 2-tuples
        Polygon region annotations.
    labels: list of str
        Polygon region annotation labels.
    label_map: dict {str: int}
        Correspondence between polygon region annotation labels and integer
        values used in numpy masks.

    """

    def __init__(self, filename, annotation_filename, xml_style):
        openslide.OpenSlide.__init__(self, filename)
        self.filename = filename  # Useful to name predicted annotations
        self.annotation_filename = annotation_filename
        self.xml_style = xml_style
        self.polygons = None
        self.label_map = None

        # Label annotations as:
        #   1: Normal tissue
        #   2: Tumor tissue
        # 0 is automatically assigned to all unannotated pixels
        if self.annotation_filename is not None:
            if self.xml_style == 'asap':
                self.polygons, self.labels = reader.asap_annotations(
                    self.annotation_filename)
                # CAMELYON17 data
                if 'metastases' in self.labels:
                    # Avoid value 0 (used by default for unlabeled regions)
                    self.label_map = {'metastases': 2, 'normal': 1}
                # CAMELYON16 data
                elif '_0' in self.labels or '_1' in self.labels:
                    self.label_map = {'_0': 2, '_1': 2, '_2': 1}
                elif 'Tumor' in self.labels or 'Exclusion' in self.labels:
                    self.label_map = {'Tumor': 2, 'Exclusion': 1}
                else:  # Predicted annotations
                    self.label_map = {'predicted_tumor': 1}
            elif self.xml_style == 'bach':
                # Value 1 is reserved for 'normal' tissue annotations
                self.label_map = {'Benign': 2, 'Carcinoma in situ': 3,
                                  'Invasive carcinoma': 4}
                self.polygons, self.labels = reader.bach_annotations(
                    self.annotation_filename)
            else:
                raise ValueError(
                    '"xml_style" value must be either "asap" or "bach".')
        else:
            if self.xml_style is not None:
                warnings.warn('"xml_style" is only used if an ' +
                              '"annotation_filename" is provided.')


[docs]class Slide(_AnnotatedOpenSlide):
    """An open WSI, with or without annotations.

    Added functionality to OpenSlide WSI instances, optionally with
    annotations.

    Parameters
    ----------
    filename: str
        Path to slide file. Namesake argument to `openslide.OpenSlide` and
        `AnnotatedOpenSlide`.
    annotation_filename: str
        Path to XML annotation file. Namesake argument to `AnnotatedOpenSlide`.
    xml_style: str
        The computational histology challenge releasing the dataset. Namesake
        argument to `AnnotatedOpenSlide`.

    Attributes
    ----------
    tissue_mask: Numpy 2D array
        Tissue RoI ``annotation``.
    tissue_label_map: dict {str: int}
        Correspondence between polygon region annotation labels and integer
        values used in numpy masks.
    downsampling_factor: float
        The scaling factor relative to level 0 dimensions.
    downsampled_slide: PIL image
        The scaled down slide generated when running the tissue detector.

    Examples
    --------
    >>> from wsipre import slide
    >>> wsi = slide.Slide('tumor_001.tif', 'tumor_001.xml', 'asap')
    >>> wsi.label_map
    {'_0': 2, '_1': 2, '_2': 1}
    >>> wsi.level_count
    10
    >>> len(wsi.polygons)
    2

    >>> thumbnail, mask, dwnspl_factor = wsi.get_thumbnail_with_annotation(
    ...     size=(3000, 3000), polygon_type='line', line_thickness=5)

    >>> slide_region = wsi.read_region_with_annotation(
    ...     location=(65000, 110000), level=4, size=(1000, 1000))

    """

    def __init__(self, filename, annotation_filename=None, xml_style=None):
        self.tissue_mask = None
        self.tissue_label_map = None
        self.downsampling_factor = None
        self.downsampled_slide = None

        # OpenSlide version < 3.4.1 lacks support for some recent file formats
        version = openslide.__library_version__
        if StrictVersion(version) < StrictVersion('3.4.1'):
            warnings.warn(
                f'OpenSlide version {version} lacks support for some ' +
                'whole-slide image file formats. It is highly recommended ' +
                'to update to a more recent version.')

        if annotation_filename is None:
            if xml_style is not None:
                warnings.warn(
                    '"xml_style" is not used (no annotation was provided).')
            openslide.OpenSlide.__init__(self, filename)
        else:
            _AnnotatedOpenSlide.__init__(
                self, filename, annotation_filename, xml_style)

    def _draw_polygons(self, mask, polygons, polygon_type,
                       line_thickness=None):
        """Convert polygon vertex coordinates to polygon drawing."""
        for poly, label in zip(polygons, self.labels):
            if polygon_type == 'line':
                mask = cv2.polylines(mask, [poly], True,
                                     int(self.label_map[label]),
                                     line_thickness)
            elif polygon_type == 'area':
                if line_thickness is not None:
                    warnings.warn('"line_thickness" is only used if ' +
                                  '"polygon_type" is "line".')

                mask = cv2.fillPoly(mask, [poly], int(self.label_map[label]))
            else:
                raise ValueError(
                    'Accepted "polygon_type" values are "line" or "area".')

        return mask

    def _draw_tissue_polygons(self, mask, polygons, polygon_type,
                              line_thickness=None):
        """Convert tissue polygon vertex coordinates to polygon drawing."""
        tissue_label = 1

        for poly in polygons:
            if polygon_type == 'line':
                mask = cv2.polylines(
                    mask, [poly], True, tissue_label, line_thickness)
            elif polygon_type == 'area':
                if line_thickness is not None:
                    warnings.warn('"line_thickness" is only used if ' +
                                  '"polygon_type" is "line".')

                mask = cv2.fillPoly(mask, [poly], tissue_label)
            else:
                raise ValueError(
                    'Accepted "polygon_type" values are "line" or "area".')

        return mask

[docs]    def get_thumbnail_with_annotation(self, size, polygon_type='area',
                                      line_thickness=None):
        """Convert *annotated* WSI to a scaled-down thumbnail.

        Parameters
        ----------
        size: 2-tuple
            Maximum size of compressed image thumbnail as (width, height).
        polygon_type: str
            Type of polygon drawing:
            * 'line'
            * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Scaled down slide (as PIL RGBA image) and annotation mask (as numpy 2D
        array), plus the scaling factor.

        """
        if self.polygons is None:
                raise ValueError(
                    'No annotation is available. Please load an annotation ' +
                    'or consider using "get_thumbnail" instead.')

        thumb = self.get_thumbnail(size)

        width_factor = self.dimensions[0] / size[0]
        height_factor = self.dimensions[1] / size[1]
        downsampling_factor = max(width_factor, height_factor)

        polygons = np.array([np.array(poly) for poly in self.polygons])
        polygons = polygons / downsampling_factor
        polygons = [poly.astype('int32') for poly in polygons]

        mask = np.zeros(thumb.size[::-1])  # Reverse width and height
        mask = self._draw_polygons(
            mask, polygons, polygon_type, line_thickness)

        return thumb, mask, downsampling_factor

[docs]    def read_region_with_annotation(self, location, level, size,
                                    polygon_type='area', line_thickness=None):
        """Crop a smaller region from an *annotated* WSI.

        Get a defined region from a WSI, together with its annotation mask.

        Parameters
        ----------
        location: 2-tuple
            X and y coordinates of the top left pixel. Namesake argument to
            OpenSlide's "read_region" function.
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        polygon_type: str
            Type of polygon drawing:
            * 'line'
            * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if 'polygon_type' is 'line'.

        Returns
        -------
        Image region (as PIL RGBA image) and the annotation mask (as numpy 2d
        array).

        """
        if self.polygons is None:
                raise ValueError(
                    'No annotation is available. Please load an annotation ' +
                    'or consider using "read_region" instead.')
        downsampling_factor = self.level_downsamples[level]
        slide_region = self.read_region(location, level, size)

        polygons = np.array([np.array(poly) for poly in self.polygons])
        polygons = polygons / downsampling_factor
        polygons = [poly.astype('int32') for poly in polygons]

        location = tuple(int(coord / downsampling_factor)
                         for coord in location)

        # Convert polygon coordinates to patch coordinates
        polygons = [poly - np.array(location) for poly in polygons]

        mask_region = np.zeros(size[::-1])
        mask_region = self._draw_polygons(
           mask_region, polygons, polygon_type, line_thickness)

        return slide_region, mask_region

[docs]    def get_tissue_mask(self, downsampling_factor=64, polygon_type='area',
                        line_thickness=None):
        """Generate tissue region annotation.

        Make a binary mask annotating tissue regions of interest (RoI) on the
        WSI using an automatic threshold-based segmentation method inspired by
        the one used by Wang *et al.* [1]_. Briefly, the method consists on the
        following steps, starting from a WSI:
            * Select downsampling level (typically a factor of 64)
            * Transfer from the RGB to the HSV color space
            * Determine optimal threshold value in the saturation channel using
              the Otsu algorithm [2]_
            * Threshold image to generate a binary mask
            * Fill in small holes and remove small objects

        .. [1] Dayong Wang, Aditya Khosla, Rishab Gargeya, Humayun Irshad,
           Andrew H. Beck, "Deep Learning for Identifying Metastatic Breast
           Cancer", arXiv:1606.05718.

        .. [2] Nobuyuki Otsu, "A Threshold Selection Method from Gray-level
           Histograms", IEEE Trans Syst Man Cybern., 9(1):62–66, 1979.

        Parameters
        ----------
        downsampling_factor: int
            The desired factor to downsample the image by, since full WSIs will
            not fit in memory. The image's closest level downsample is found
            and used.
        polygon_type: str
            Type of polygon drawing:
            * 'line'
            * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Binary mask as numpy 2D array, RGB slide image (in the used
        downsampling level, to allow visualization) and downsampling factor.

        """
        (mask_contours,
         self.downsampled_slide,
         self.downsampling_factor) = tissue.detect_tissue(
             self, downsampling_factor)

        mask = np.zeros(self.downsampled_slide.shape[:2])
        self.tissue_mask = self._draw_tissue_polygons(
           mask, mask_contours, polygon_type, line_thickness)
        self.tissue_label_map = {'background': 0, 'tissue': 1}

        return self

    def _get_random_coordinates(self, pixels_in_roi, level,
                                mask_downsampling_factor, size):
        """Get region (patch) coordinates at random."""
        coordinates = random.choice(pixels_in_roi)

        # Scale coordinates up to level-0 dimensions
        coordinates = [int(x * mask_downsampling_factor) for x in coordinates]

        # OpenSlide Python's read_region takes top left corner position, which
        # effectively excludes tissue above and to the left of annotations
        # Fix it by offsetting coordinates: convert to center pixel position
        # (subtract half of patch size from x and y coordinates)
        half_size = [x * self.level_downsamples[level] / 2 for x in size]
        row_location = coordinates[0] - half_size[0]
        col_location = coordinates[1] - half_size[1]

        # Make sure coordinates are still within slide margins
        cols_edge = self.dimensions[0] - size[0]
        rows_edge = self.dimensions[1] - size[1]
        row_location = max(0, min(row_location, rows_edge))
        col_location = max(0, min(col_location, cols_edge))

        return int(col_location), int(row_location)  # OpenSlide: width, height

    def _get_pixels_avoiding_labels(self, avoid_labels):
        # Can only do this if an annotation is available
        if self.polygons is None:
            raise ValueError('No annotation is available.')

        # Generate annotated thumbnail with exact dimensions of tissue mask
        rows, cols = self.tissue_mask.shape
        _, mask, _ = self.get_thumbnail_with_annotation(
            size=(cols, rows), polygon_type='area')

        # Locate pixels in tissue and not in "avoid_labels"
        new_mask = np.where(self.tissue_mask == 1, 1, mask)
        new_mask = np.where(np.isin(mask, avoid_labels), 2, new_mask)

        return tuple(zip(*np.where(new_mask == 1)))

    def _max_repeated_pixel_ratio(self, image):
        """Compute ratio of count of most common pixel value to total count."""
        image = np.array(image)
        _, counts = np.unique(image, return_counts=True)

        return np.max(counts) / image.size

    def _get_area_ratio(self, mask, target_class):
        """Compute ratio of pixels labeled as 'target_class' to all pixels."""
        class_pixels = mask[np.where(mask == target_class)].size

        return class_pixels / mask.size

[docs]    def read_random_tissue_patch(self, level, size, avoid_labels=None):
        """Crop random patch from detected tissue on WSI.

        Parameters
        ----------
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        avoid_labels: iterable
            Labels from provided annotation to avoid when targeting tissue.
            An annotation is must be available.

        Returns
        -------
        Image region or patch as PIL RGBA image (plus annotation mask as 2D
        Numpy array if labels to avoid are provided).

        """
        if self.tissue_mask is None:
            self.get_tissue_mask()

        if avoid_labels is None:
            # Select coordinates matching tissue (labeled as 1)
            pixels_in_roi = tuple(zip(*np.where(self.tissue_mask == 1)))
        else:
            # Select coordinates matching tissue (labeled as 1) and avoiding
            # indicated labels
            # Can only do this if an annotation is available
            if self.polygons is None:
                raise ValueError('No annotation is available.')

            # Generate annotated thumbnail with exact dimensions of tissue mask
            rows, cols = self.tissue_mask.shape
            _, mask, _ = self.get_thumbnail_with_annotation(
                size=(cols, rows), polygon_type='area')

            # Locate pixels in tissue and not in "avoid_labels"
            new_mask = np.where(self.tissue_mask == 1, 1, mask)
            new_mask = np.where(np.isin(mask, avoid_labels), 2, new_mask)

            pixels_in_roi = tuple(zip(*np.where(new_mask == 1)))

        coordinates = self._get_random_coordinates(
            pixels_in_roi, level, self.downsampling_factor, size)
        patch = self.read_region(coordinates, level, size)

        # Avoid false positive regions where a large proportion of pixels is
        # exactly the same
        while self._max_repeated_pixel_ratio(patch) > 0.5:
            coordinates = self._get_random_coordinates(
                pixels_in_roi, level, self.downsampling_factor, size)
            patch = self.read_region(coordinates, level, size)

        # Also, make sure the patch is mostly normal tissue
        # (or backgound, not actually excluding it here...)
        if avoid_labels is not None:
            patch, mask = self.read_region_with_annotation(
                coordinates, level, size, polygon_type='area')
            avoid_ratios = [self._get_area_ratio(mask, x)
                            for x in avoid_labels]
            i = 0
            while any(x > 0.1 for x in avoid_ratios):
                i += 1
                coordinates = self._get_random_coordinates(
                    pixels_in_roi, level, self.downsampling_factor, size)
                patch, mask = self.read_region_with_annotation(
                    coordinates, level, size, polygon_type='area')
                avoid_ratios = [self._get_area_ratio(mask, x)
                                for x in avoid_labels]
                if i > 15:
                    raise ValueError(
                        'Cannot seem to find patch matching requirements.')

            return patch, mask

        return patch

    def _pick_random_coordinates(self, level, size, target_class):
        """Pick random patch to crop from WSI."""
        # Get thumbnail and select coordinates at random
        _, mask, downsampling_factor = self.get_thumbnail_with_annotation(
             size=(5000, 5000), polygon_type='area')

        # Select coordinates matching target class
        pixels_in_roi = tuple(zip(*np.where(mask == target_class)))

        coordinates = self._get_random_coordinates(
            pixels_in_roi, level, downsampling_factor, size)

        # Get ratio (polygon type must be 'area')
        _, mask = self.read_region_with_annotation(
            coordinates, level, size, polygon_type='area')
        area_ratio = self._get_area_ratio(mask, target_class)

        return coordinates, area_ratio

[docs]    def read_random_patch(self, level, size, target_class,
                          min_class_area_ratio, polygon_type='area',
                          line_thickness=None):
        """Crop random patch from WSI.

        Select random location within target class according to provided
        annotation.

        Parameters
        ----------
        level: int
            Slide level. Namesake argument to OpenSlide's "read_region"
            function.
        size: 2-tuple
            Crop size (width, height). Namesake argument to OpenSlide's
            "read_region" function.
        target_class: int
            The class annotation of the central pixel of the patch.
            function.
        min_class_area_ratio: float [0, 1]
            Minimum ratio of target class pixels to total pixels.
        polygon_type: str
            Type of polygon drawing:
            * 'line'
            * 'area'
        line_thickness: int
            Polygon edge line thickness. Required if ``polygon_type`` is
            ``line``.

        Returns
        -------
        Image region or patch as PIL RGBA image plus annotation mask as 2D
        Numpy array.

        """
        if not 0 <= min_class_area_ratio <= 1:
            raise ValueError(
                '"min_class_area_ratio" must be in the interval [0, 1].')

        coordinates, area_ratio = self._pick_random_coordinates(
            level, size, target_class)

        # When the slide level is high, the following while loop may be
        # infinite, since the tumor area will never be high enough.
        # Try to sample a few times and throw error if not successful
        i = 0
        while area_ratio < min_class_area_ratio:
            i += 1
            coordinates, area_ratio = self._pick_random_coordinates(
                level, size, target_class)
            if i > 15:
                raise ValueError(
                    'Cannot seem to find patch with a minimum of ' +
                    f'{min_class_area_ratio * 100}% of class {target_class}.' +
                    ' The chosen slide level may be too high.')

        # Get actual data (chosen polygon type)
        patch, mask = self.read_region_with_annotation(
            coordinates, level, size, polygon_type, line_thickness)

        return patch, mask