Source code for privattacks.data

import os
import pyreadr
import numpy as np
import pandas as pd

class Data:
    """A class for handling datasets.

    The supported file formats are 'csv' (optionally zipped), 'rdata' and
    'sas7bdat'. A dataset can alternatively be given as a pandas dataframe or
    as a numpy matrix.

    Parameters:
        file_name (str, optional): Dataset file path.
        cols (list, optional): Dataset columns. If not given when given
            file_name, read all columns in the file. Required when ``matrix``
            is given.
        cols_to_ignore (list, optional): Columns to ignore in the conversion
            to integers from 0 to domain_size-1. It must be used for columns
            with integer values only.
        sep_csv (str, optional): CSV delimiter, default is ",".
        encoding (str, optional, default 'utf-8'): Encoding to use for UTF
            when reading/writing (ex. 'utf-8', 'latin1').
        dataframe (pandas.DataFrame, optional): Pandas dataframe containing
            the dataset. It is not modified.
        matrix (numpy.ndarray, optional): Numpy 2d matrix containing the
            dataset. It is copied and stored as is (no conversion).
        domains (dict[str, list], optional): Domain of columns. If not given,
            the domains will be taken from data. Keys are column names and
            values are lists.
        na_values (int, optional): Value to fill missing data (NaN) with,
            default is -1.

    Attributes:
        dataset (numpy.ndarray): Numpy matrix of integers.
        n_rows (int): Number of rows (records) in the dataset.
        n_cols (int): Number of columns (attributes) in the dataset.
        cols (list): List of column names in the dataset. The same order as
            the dataset matrix.
        cols_to_ignore (list): Columns kept with their original integer
            values (``None`` if not given).
        domains (dict[str, list]): Column domains. Keys are column names and
            values are lists. To generate the numpy matrix each original
            value will be converted to its index in the domain's list.

    Raises:
        TypeError: If ``dataframe``/``matrix`` have the wrong type, if the
            file extension is not supported, or if no data source is given.
        ValueError: If ``matrix`` is given without ``cols``.
        NameError: If a csv file is given with ``sep_csv=None``.
    """

    def __init__(
        self,
        file_name=None,
        cols=None,
        cols_to_ignore=None,
        sep_csv=",",
        encoding='utf-8',
        dataframe=None,
        matrix=None,
        domains=None,
        na_values=-1
    ):
        # Source precedence: dataframe, then matrix, then file_name.
        if dataframe is not None:
            if not isinstance(dataframe, pd.DataFrame):
                raise TypeError("dataframe must be a pandas.DataFrame object")
        elif matrix is not None:
            if not isinstance(matrix, np.ndarray):
                raise TypeError("matrix must be a numpy.ndarray object")
            if cols is None:
                raise ValueError("cols argument not given")
        elif file_name is not None:
            dataframe = self._read_file(file_name, cols, sep_csv, encoding)
        else:
            raise TypeError("Either file_name, dataframe or matrix must be given")

        if matrix is not None:
            self.n_rows = matrix.shape[0]
            self.n_cols = matrix.shape[1]
            self.cols = cols
        else:
            # Not done in place, so the caller's dataframe is left untouched.
            dataframe = dataframe.replace(np.nan, na_values)
            self.n_rows = dataframe.shape[0]
            self.n_cols = dataframe.shape[1]
            self.cols = dataframe.columns.to_list()

        self.cols_to_ignore = cols_to_ignore

        # If domains is not given, take the domains from the dataset.
        # NOTE(review): when only `matrix` is given there is no dataframe, so
        # `domains` ends up as None and `np2df` cannot be used; pass `domains`
        # explicitly in that case.
        if domains is not None:
            self.domains = domains
        else:
            self.domains = self._get_col_domains(dataframe)

        if matrix is not None:
            self.dataset = matrix.copy()
        else:
            self.dataset = self.df2np(dataframe)

    def col2int(self, col) -> "int | list[int]":
        """Index of a column in the dataset numpy matrix.

        Parameters:
            col (str or list[str]): A column name or a list of column names.

        Returns:
            int or list[int]: The index of the column, or the list of indexes
            (same order as ``col``) when a list is given.

        Raises:
            ValueError: If ``col`` is neither a string nor a list, or if a
                name is not a column of the dataset.
        """
        if isinstance(col, str):
            return self.cols.index(col)
        if isinstance(col, list):
            return [self.cols.index(name) for name in col]
        raise ValueError("col must be a string or a list of strings.")

    def np2df(self) -> pd.DataFrame:
        """Convert the numpy matrix to the dataset original domains.

        Columns in ``cols_to_ignore`` are returned as stored in the matrix.

        Returns:
            df (pandas.DataFrame): Dataset with original domains.
        """
        df = pd.DataFrame(self.dataset, columns=self.cols)
        for col in self._cols_to_convert():
            domain = self.domains[col]
            # Each stored integer is the position of the value in the domain.
            df[col] = df[col].apply(lambda value, domain=domain: domain[value])
        return df

    def df2np(self, dataframe: pd.DataFrame) -> np.ndarray:
        """Converts a pandas dataframe to a numpy.ndarray.

        The matrix contains integers in "standard" type, i.e., for all column
        c, the original values from the domain of c are converted to integers
        from 0 to size(c)-1. Each original value in a domain will be converted
        to the respective index the value is in the domain list. Columns in
        ``cols_to_ignore`` are copied without conversion.

        Parameters:
            dataframe (pandas.DataFrame): Dataset.

        Returns:
            dataset (numpy.ndarray): Dataset in standard type.

        Raises:
            ValueError: If a value of a converted column is not in the
                column's domain.
        """
        # Create a transposed matrix because numpy is row-oriented
        dataset = np.empty(dataframe.shape[::-1], dtype=int)

        if self.cols_to_ignore:
            for col in self.cols_to_ignore:
                dataset[self.col2int(col), :] = dataframe[col]

        for col in self._cols_to_convert():
            # Build the value -> index table once per column instead of
            # scanning the domain list for every cell. setdefault keeps the
            # first position of a repeated value, like list.index does.
            index_of = {}
            for position, value in enumerate(self.domains[col]):
                index_of.setdefault(value, position)
            try:
                codes = [index_of[value] for value in dataframe[col]]
            except KeyError as err:
                raise ValueError(
                    f"value {err.args[0]!r} of column {col!r} is not in its domain"
                ) from err
            dataset[self.col2int(col), :] = codes

        # Convert back the dataset to the correct orientation
        return dataset.T

    def _cols_to_convert(self) -> list:
        """Columns (in dataset order) that are not in ``cols_to_ignore``."""
        if not self.cols_to_ignore:
            return self.cols
        ignored = set(self.cols_to_ignore)
        return [col for col in self.cols if col not in ignored]

    def _read_file(self, file_name, cols, sep_csv, encoding) -> pd.DataFrame:
        """Read a dataset file into a dataframe, based on its extension."""
        file_type = self._file_extension(file_name)

        if file_type in (".csv", ".zip"):
            if sep_csv is None:
                raise NameError("sep_csv must be provided for csv files")
            return pd.read_csv(file_name, sep=sep_csv, usecols=cols, encoding=encoding)

        if file_type == ".rdata":
            rdata = pyreadr.read_r(file_name)
            # Use the first object stored in the file.
            dataframe = rdata[next(iter(rdata))]
        elif file_type == ".sas7bdat":
            dataframe = pd.read_sas(file_name)
        else:
            raise TypeError("The only supported files are 'csv', 'rdata' and 'sas7bdat' or zip versions of them")

        # These readers load every column; keep only the requested ones.
        if cols is not None:
            dataframe = dataframe[list(cols)]
        return dataframe

    def _get_col_domains(self, dataset):
        """Get columns domain from the dataset.

        For a dataframe the keys are the names in ``self.cols`` and the
        values are listed in order of first appearance. For a numpy matrix
        the keys are the column positions and the values are sorted.
        Returns ``None`` for any other input.
        """
        if isinstance(dataset, pd.DataFrame):
            return {col: dataset[col].unique().tolist() for col in self.cols}
        if isinstance(dataset, np.ndarray):
            return {col: np.unique(dataset[:, col]).tolist() for col in range(dataset.shape[1])}
        return None

    def _file_extension(self, file_name: str) -> str:
        """Infer the file extension from a given file path.

        Parameters:
            file_name (str): The path to the file.

        Returns:
            str: The file extension in lowercase (including the dot).
        """
        _, extension = os.path.splitext(file_name)
        return extension.lower()