Source code for glow.wgr.ridge_reduction

# Copyright 2019 The Glow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .ridge_udfs import *
from .model_functions import _is_binary, _prepare_covariates, _prepare_labels_and_warn, _check_model
from nptyping import Float, NDArray
import pandas as pd
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pyspark.sql.functions as f
from typeguard import typechecked
from typing import Any, Dict, List, Union
from glow.logging import record_hls_event
import warnings

# Ignore warning to use applyInPandas instead of apply
# TODO(hhd): Remove this and start using applyInPandas once we only support Spark 3.x.
warnings.filterwarnings('ignore', category=UserWarning, message='.*applyInPandas.*')

__all__ = ['RidgeReduction']


@typechecked
class RidgeReduction:
    """
    The RidgeReduction class is intended to reduce the feature space of an N by M block matrix X
    to an N by P<<M block matrix.  This is done by fitting K ridge models within each block of X
    on one or more target labels, such that a block with L columns to begin with will be reduced
    to a block with K columns, where each column is the prediction of one ridge model for one
    target label.
    """
    def __init__(self,
                 block_df: DataFrame,
                 label_df: pd.DataFrame,
                 sample_blocks: Dict[str, List[str]],
                 cov_df: Union[pd.DataFrame, None] = None,
                 add_intercept: bool = True,
                 alphas: Union[List[float], None] = None,
                 label_type: str = 'detect') -> None:
        """
        Args:
            block_df : Spark DataFrame representing the beginning block matrix X
            label_df : Pandas DataFrame containing the target labels used in fitting the ridge
                models
            sample_blocks : Dict containing a mapping of sample_block ID to a list of
                corresponding sample IDs
            cov_df : Pandas DataFrame containing covariates to be included in every model in the
                stacking ensemble (optional). When omitted (or None), a new empty DataFrame is
                used.
            add_intercept: If True, an intercept column (all ones) will be added to the
                covariates (as the first column)
            alphas : array_like of alpha values used in the ridge reduction (optional). When
                omitted, None, or empty, the alphas are produced by generate_alphas(block_df).
            label_type: String to determine type treatment of labels. It can be 'detect'
                (default), 'binary', or 'quantitative'.
        """
        # None sentinels replace the former mutable defaults so that no DataFrame/list object
        # is shared between instances created without these arguments.
        if cov_df is None:
            cov_df = pd.DataFrame({})
        if alphas is None:
            alphas = []

        self.block_df = block_df
        self.sample_blocks = sample_blocks
        # _label_type must be set before set_label_df, which reads it.
        self._label_type = label_type
        # Labels must be set before covariates: set_cov_df reads self._label_df.
        self.set_label_df(label_df)
        self.set_cov_df(cov_df, add_intercept)
        self.set_alphas(alphas)
        self.model_df = None
        self.reduced_block_df = None

    def __getstate__(self):
        """
        Returns the picklable part of the instance state: a shallow copy of the instance
        __dict__ without the block, model, and reduced-block DataFrame attributes.
        """
        # The lambdas handed to pandas_udf in fit/transform reference self, so presumably the
        # instance is serialized along with them; the DataFrame attributes are left out as
        # unpicklable entries.
        state = self.__dict__.copy()
        for unpicklable in ('block_df', 'model_df', 'reduced_block_df'):
            # pop with a default tolerates a state in which the entry is already absent.
            state.pop(unpicklable, None)
        return state

    def set_label_df(self, label_df: pd.DataFrame) -> None:
        """
        Stores label_df and refreshes the values derived from it (_is_binary and _std_label_df),
        using the current label type.

        Note: _std_cov_df is not recomputed here even though set_cov_df derives it from the
        label DataFrame; call set_cov_df again if the covariates must reflect the new labels.
        """
        self._is_binary = _is_binary(label_df)
        self._std_label_df = _prepare_labels_and_warn(label_df, self._is_binary, self._label_type)
        self._label_df = label_df

    def get_label_df(self) -> pd.DataFrame:
        """Returns the label DataFrame as it was passed in."""
        return self._label_df

    def set_label_type(self, label_type: str) -> None:
        """Stores label_type and re-derives _std_label_df from the current label DataFrame."""
        self._label_type = label_type
        self._std_label_df = _prepare_labels_and_warn(self._label_df, self._is_binary, label_type)

    def get_label_type(self) -> str:
        """Returns the current label type string."""
        return self._label_type

    def set_cov_df(self, cov_df: pd.DataFrame, add_intercept: bool) -> None:
        """
        Stores cov_df and derives _std_cov_df from it via _prepare_covariates, using the current
        label DataFrame and the add_intercept flag.
        """
        self._cov_df = cov_df
        self._std_cov_df = _prepare_covariates(cov_df, self._label_df, add_intercept)

    def get_cov_df(self) -> pd.DataFrame:
        """Returns the covariate DataFrame as it was passed in."""
        return self._cov_df

    def set_alphas(self, alphas: List[float]) -> None:
        """
        Sets the alphas used by the ridge models.  An empty list means the alphas are generated
        from block_df via generate_alphas; otherwise they are built with create_alpha_dict.
        """
        if len(alphas) == 0:
            self._alphas = generate_alphas(self.block_df)
        else:
            self._alphas = create_alpha_dict(alphas)

    def get_alphas(self) -> Dict[str, Float]:
        """Returns the alphas currently in use."""
        return self._alphas

    def is_binary(self) -> bool:
        """Returns the binary flag computed from the label DataFrame by _is_binary."""
        return self._is_binary

    def fit(self) -> DataFrame:
        """
        Fits a ridge reducer model, represented by a Spark DataFrame containing coefficients for
        each of the ridge alpha parameters, for each block in the starting matrix, for each label
        in the target labels.

        The result is also stored on the instance as model_df.

        Returns:
            Spark DataFrame containing the model resulting from the fitting routine.
        """
        map_key_pattern = ['header_block', 'sample_block']
        reduce_key_pattern = ['header_block', 'header']

        # When the block matrix carries a 'label' column, group by it as well.
        if 'label' in self.block_df.columns:
            map_key_pattern.append('label')
            reduce_key_pattern.append('label')

        map_udf = pandas_udf(
            lambda key, pdf: map_normal_eqn(key, map_key_pattern, pdf, self._std_label_df,
                                            self.sample_blocks, self._std_cov_df),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        reduce_udf = pandas_udf(
            lambda key, pdf: reduce_normal_eqn(key, reduce_key_pattern, pdf),
            normal_eqn_struct, PandasUDFType.GROUPED_MAP)
        model_udf = pandas_udf(
            lambda key, pdf: solve_normal_eqn(key, map_key_pattern, pdf, self._std_label_df,
                                              self._alphas, self._std_cov_df),
            model_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceFit')

        # Three grouped passes: map, then reduce, then solve.
        mapped = self.block_df.groupBy(map_key_pattern).apply(map_udf)
        reduced = mapped.groupBy(reduce_key_pattern).apply(reduce_udf)
        self.model_df = reduced.groupBy(map_key_pattern).apply(model_udf)

        return self.model_df

    def transform(self) -> DataFrame:
        """
        Transforms a starting block matrix to the reduced block matrix, using a reducer model
        produced by the RidgeReduction fit method.

        The result is also stored on the instance as reduced_block_df.

        Returns:
            Spark DataFrame representing the reduced block matrix
        """
        _check_model(self.model_df)

        transform_key_pattern = ['header_block', 'sample_block']

        # Right join: every row of the model is kept.
        joined = self.block_df.drop('sort_key').join(
            self.model_df, ['header_block', 'sample_block', 'header'], 'right')

        if 'label' in self.block_df.columns:
            transform_key_pattern.append('label')
            # Where 'label' is null after the join, fall back to the first element of 'labels'.
            joined = joined.withColumn(
                'label', f.coalesce(f.col('label'),
                                    f.col('labels').getItem(0)))

        transform_udf = pandas_udf(
            lambda key, pdf: apply_model(key, transform_key_pattern, pdf, self._std_label_df,
                                         self.sample_blocks, self._alphas, self._std_cov_df),
            reduced_matrix_struct, PandasUDFType.GROUPED_MAP)

        record_hls_event('wgrRidgeReduceTransform')

        self.reduced_block_df = joined.groupBy(transform_key_pattern).apply(transform_udf)

        return self.reduced_block_df

    def fit_transform(self) -> DataFrame:
        """
        Fits a ridge reduction model with a block matrix, then transforms the matrix using the
        model.

        Returns:
            Spark DataFrame representing the reduced block matrix
        """
        self.fit()
        return self.transform()