# Source code for preprocess

# Global modules
import collections
import math
import time

# Internal modules
from utils import *

class PreProcessor:
    """
    The preprocessor class that handles binarization and discretization of dataset

    Each input line is treated as a fixed-width record. Every configured field
    is cut out of the line by its 1-based, inclusive ``STR``/``END`` column
    positions (see :class:`PreProcessor.Mapper`) and converted into an item
    name of the form ``<COL>_IS_<ATTR>``.
    """

    def __init__(self):
        # List of {'ID': int, 'ITEMS': OrderedDict(item name -> True)}
        self.transactions = []
        # Item name -> number of occurrences, kept sorted by item name
        self.unique = collections.OrderedDict()
        self.trans_count = 0
        self.mapper = PreProcessor.Mapper()

    def get_transactions(self):
        """
        Getter method for transactions list of OrderedDict

        :return: Transactions list (list(dict)), each with 'ID' and 'ITEMS' keys
        """
        return self.transactions

    def get_uniques(self):
        """
        Getter method for unique itemsets (dictionary)

        :return: Unique itemsets (dict), key: items, value: counts
        """
        return self.unique

    def get_transaction_count(self):
        """
        Getter method for transaction count after parsing the file

        :return: Transaction count (int)
        """
        return self.trans_count

    def parse_file(self, file):
        """
        The main function to parse the file and run the preprocesser methods

        Every line of the file becomes one transaction made of the fields
        listed below, in that order.

        :param file: Filepath of the data
        :return: Returns number of the transaction parsed (int)
        :raises ValueError: If a field value cannot be converted or mapped
        :raises KeyError: If a BINARY field value is missing from its 'VALS'
        """
        print("Preprocess begin to parse the file")
        # time.clock() was removed in Python 3.8; perf_counter() is its
        # documented replacement for measuring elapsed time.
        start_t = time.perf_counter()

        mp = self.mapper
        # Fields extracted from every line. The lang_native and fight fields
        # are intentionally left out (their mappers are disabled in Mapper).
        field_mappers = (
            mp.sex, mp.race, mp.score, mp.fam_comp, mp.par_edu, mp.income,
            mp.s_expect, mp.control, mp.sc_loc, mp.late, mp.homework,
            mp.sh_accomp, mp.sh_poorp, mp.good_grec, mp.likes_s, mp.library,
        )

        with open(file, "r") as f:
            for line in f:
                fields = [self.get_field(line, fm) for fm in field_mappers]
                self.add_transaction(fields)

        # Performance measurements
        total_t = self._elapsed(start_t)
        print("Preprocessing took {:>10} seconds" .format(total_t))

        # Return number of transactions added
        return self.trans_count

    def get_field(self, line, mapper):
        """
        Selects the appropriate preprocessing method according to line and mapper structure

        :param line: New line of data (str)
        :param mapper: Corresponding mapper structure of the field
        :return: Preprocessed field (str)
        """
        # 'STR' is 1-based and 'END' is inclusive, hence the -1 on the start only
        value = line[mapper['STR'] - 1: mapper['END']]
        field_type = mapper['TYPE']

        if field_type == 'BINARY':
            return self._is_name(mapper['COL'], mapper['VALS'][int(value)])
        if field_type == 'CATEGORICAL':
            return self.binarize(mapper, int(value))
        # Any other type is treated as a continuous value
        return self.discretize(mapper, float(value))

    def add_transaction(self, fields):
        """
        Adds the preprocessed fields into the transaction list

        Also increments the transaction counter and updates the unique counts.

        :param fields: Preprocessed fields (list)
        :return:
        """
        self.trans_count += 1
        self.count_unique(fields)  # Updates unique dict

        # True is only a placeholder value: the OrderedDict is used as an
        # ordered set of item names, sorted by name.
        items = collections.OrderedDict(
            (name, True) for name in sorted(set(fields)))

        self.transactions.append({'ID': self.trans_count, 'ITEMS': items})

    def count_unique(self, fields):
        """
        Updates the count of unique fields

        :param fields: List of fields
        :return:
        """
        for f in fields:
            self.unique[f] = self.unique.get(f, 0) + 1
        # Keep the dictionary ordered by item name after every update
        self.unique = collections.OrderedDict(
            sorted(self.unique.items(), key=lambda pair: pair[0]))

    def discretize(self, mapper, col_data):
        """
        Used to discretize the continous values from the given mapper and value

        The range [floor(MIN), ceil(MAX)] is split into 'INTERVAL' equal-width
        bins. Both bin edges are inclusive, so a value lying exactly on a
        shared edge is assigned to the lower bin.

        :param mapper: Mapper of the continious field (Mapper Class)
        :param col_data: Value of the continous field (float)
        :return: Returns discretized name of the field (string)
        :raises ValueError: If the value does not fall into any bin
        """
        range_max = math.ceil(mapper['MAX'])
        range_min = math.floor(mapper['MIN'])
        interval = mapper['INTERVAL']
        step = (range_max - range_min) / interval

        lower = float(format(range_min, '.2f'))
        for _ in range(interval):
            # Edges are rounded to two decimals before comparing
            upper = float(format(lower + step, '.2f'))
            if lower <= col_data <= upper:
                # The label uses the integer part of each edge
                str_interval = '[' + str(int(lower)) + '-' + str(int(upper)) + ']'
                return self._is_name(mapper['COL'], str_interval)
            lower = upper

        raise ValueError('Value is not between the intervals check preprocessor::discretize')

    def binarize(self, mapper, col_data):
        """
        Binarize the attribute data using mapper

        Values listed in the mapper's 'OTHERS' are collapsed into a single
        ``<COL>_IS_OTHERS`` item; every other known value maps to its own item.

        :param col_data: Categorical data assumed to be between -9 and 25
        :param mapper: Corresponding mapper of this field
        :return: Binarized field (str), or None when no mapper is given
        :raises ValueError: If the value is out of range or not in 'VALS'
        """
        if mapper is None:
            print("Give an appropriate mapper")
            return None

        if col_data < -9:
            raise ValueError('Values cannot be less than -9 - check binarize method in preprocess.py')

        # Change this, in case if we break something
        max_categorical_value = 25
        if col_data > max_categorical_value:
            raise ValueError('Values cannot be more than 25 - check binarize method in preprocess.py')

        vals = mapper['VALS']
        if col_data not in vals:
            raise ValueError('This key is not inside our mapper VALS - check binarize method in preprocess.py')

        others = mapper['OTHERS']
        if others is not None and col_data in others:
            attr = "OTHERS"
        else:
            # Map as standalone field
            attr = vals[col_data]
        return self._is_name(col=mapper['COL'], attr=attr)

    def save_transactions(self, path = "transactions.csv"):
        """
        Save the preprocessed transactions into a file

        The file starts with an ``ID,ITEMS`` header, followed by one line per
        transaction: the ID and then every item name, comma separated.

        :param path: Path to be saved
        :return: Returns true on successful save
        """
        print('Saving the transactions into {}'.format(path))
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        start_t = time.perf_counter()

        with open(path, 'w') as f:
            f.write("ID,ITEMS\n")
            for t in self.transactions:
                f.write(self._transaction_row(t) + "\n")

        # Performance measurements
        total_t = self._elapsed(start_t)
        print("Save procedure took {:>10} seconds" .format(total_t))

        return True

    def _print_transactions(self):
        """
        Used to print transactions in csv format

        :return:
        """
        rows = ["ID,ITEMS"]
        rows.extend(self._transaction_row(t) for t in self.transactions)
        # The text ends with a newline and print() adds one more
        print("\n".join(rows) + "\n")

    def _is_name(self, col, attr):
        """
        Used to construct the itemset name with combination of column and attiribute

        :param col: Column name of the data (str)
        :param attr: Attribute name of the data (str)
        :return: Itemset name (str)
        """
        return col.upper() + "_IS_" + attr.upper()

    @staticmethod
    def _transaction_row(transaction):
        """
        Formats one transaction as a csv row (without the line ending)

        :param transaction: Transaction dict with 'ID' and 'ITEMS' keys
        :return: "<ID>,<item>,<item>,..." (str)
        """
        return ",".join([str(transaction['ID'])] + list(transaction['ITEMS']))

    @staticmethod
    def _elapsed(start_t):
        """
        Formats the time passed since start_t

        :param start_t: Earlier reading of time.perf_counter()
        :return: Elapsed seconds with four decimals (str)
        """
        return str(format(time.perf_counter() - start_t, '.4f'))

    # Until getting nice representation using files(possibly JSON) use this structure
    # later we can create the file structure and parser for that.
    class Mapper:
        """
        Used to map the fields

        Every attribute describes one field of the fixed-width record:

        * 'COL': prefix used in the generated item names
        * 'TYPE': 'BINARY', 'CATEGORICAL' or anything else for continuous
        * 'STR' / 'END': 1-based, inclusive column positions in the line
        * 'VALS': raw code -> attribute label (BINARY / CATEGORICAL)
        * 'OTHERS': codes merged into one OTHERS item, or None
        * 'MIN' / 'MAX' / 'INTERVAL': value range and bin count (continuous)
        """

        def __init__(self):
            # Some fields can change
            self.sex = {'COL': 'SEX', 'TYPE': 'BINARY', 'STR': 24, 'END': 25,
                        'OTHERS': None,
                        'VALS': {1: 'MALE', 2: 'FEMALE'}}

            # Combine fields that are in others together
            self.race = {'COL': 'RACE', 'TYPE': 'CATEGORICAL', 'STR': 26, 'END': 27,
                         'OTHERS': {4: 'HISP_NR', 5: 'HISP_RC', 3: 'BLACK'},
                         'VALS': {1: 'AMER', 2: 'ASIA', 3: 'BLACK', 4: 'HISP_NR',
                                  5: 'HISP_RC', 6: 'MULT', 7: 'WHITE'}}

            # SCORE_IS-20_60 , 35.12
            self.score = {'COL': 'SCORE', 'TYPE': 'CONTINIOUS', 'STR': 106, 'END': 111,
                          'MIN': 20.91, 'MAX': 81.04, 'INTERVAL': 5}

            # Whether English is student's native language-composite
            # self.lang_native = {'COL': 'ENG_LANG_NATIVE', 'TYPE': 'BINARY', 'STR': 28, 'END': 29,
            #                     'OTHERS': None,
            #                     'VALS': {0: 'NO', 1: 'YES'}}

            # Family composition
            self.fam_comp = {'COL': 'FAM', 'TYPE': 'CATEGORICAL', 'STR': 42, 'END': 43,
                             'OTHERS': {4: 'GG', 5: 'M', 6: 'F', 7: 'FEG', 8: 'MAG',
                                        9: 'HALFTIME'},
                             'VALS': {1: 'MF', 2: 'MG', 3: 'FG', 4: 'GG', 5: 'M', 6: 'F',
                                      7: 'FEG', 8: 'MAG', 9: 'HALFTIME'}}

            # Parents' highest level of education
            self.par_edu = {'COL': 'PAR_EDU', 'TYPE': 'CATEGORICAL', 'STR': 44, 'END': 45,
                            'OTHERS': None,
                            'VALS': {1: 'UHS', 2: 'HS', 3: 'US', 4: 'S', 5: 'UC', 6: 'C',
                                     7: 'M', 8: 'PHD'}}

            # Total family income from all sources 2001-composite
            self.income = {'COL': 'INCOME', 'TYPE': 'CATEGORICAL', 'STR': 54, 'END': 55,
                           'OTHERS': {1: 'NONE', 2: '0-1K', 3: '1-5K'},
                           'VALS': {1: 'NONE', 2: '0-1K', 3: '1-5K', 4: '5-10K',
                                    5: '10-15K', 6: '15-20K', 7: '20-25K', 8: '25-35K',
                                    9: '35-50K', 10: '50-75K', 11: '75-100K',
                                    12: '100-200K', 13: '200K-more'}}

            # How far in school student thinks will get-composite
            self.s_expect = {'COL': 'S_EXPEC', 'TYPE': 'CATEGORICAL', 'STR': 72, 'END': 73,
                             'OTHERS': None,
                             'VALS': {-1: 'UK', 1: 'UHS', 2: 'HS', 3: 'S', 4: 'UC',
                                      5: 'C', 6: 'M', 7: 'PHD'}}

            # School control
            self.control = {'COL': 'SC_CTRL', 'TYPE': 'CATEGORICAL', 'STR': 253, 'END': 253,
                            'OTHERS': None,
                            'VALS': {1: 'PUB', 2: 'CAT', 3: 'PRI'}}

            # School urbanicity
            self.sc_loc = {'COL': 'SC_LOC', 'TYPE': 'CATEGORICAL', 'STR': 254, 'END': 254,
                           'OTHERS': None,
                           'VALS': {1: 'UR', 2: 'SUB', 3: 'RU'}}

            # Got into a physical fight at school
            # self.fight = {'COL': 'FIGHT', 'TYPE': 'CATEGORICAL', 'STR': 336, 'END': 337,
            #               'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
            #               'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
            #                        1: 'NEVER', 2: '1-2', 3: '2-MORE'}}

            # Number of times late, judging by the VALS labels.
            # NOTE(review): the original comment here repeated the FIGHT
            # description - confirm the exact wording against the codebook.
            self.late = {'COL': 'LATE', 'TYPE': 'CATEGORICAL', 'STR': 358, 'END': 359,
                         'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
                         'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                  1: 'NEVER', 2: '1-2', 3: '3-6', 4: '7-9',
                                  5: '10-more'}}

            # How often student completes homework
            self.homework = {'COL': 'HOMEWORK', 'TYPE': 'CATEGORICAL', 'STR': 1610, 'END': 1611,
                             'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                        -3: 'SKIP_ANS', -1: 'DONT_K'},
                             'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                      -3: 'SKIP_ANS', -1: 'DONT_K',
                                      1: 'NEVER', 2: 'RARELY', 3: 'SOMET', 4: 'MOSTT',
                                      5: 'ALLT'}}

            # Spoke to parents about accomplishments (English)
            self.sh_accomp = {'COL': 'SH_ACCOMP', 'TYPE': 'CATEGORICAL', 'STR': 1590, 'END': 1591,
                              'OTHERS': {-9: 'MISSING', -4: 'NO_ASW', -3: 'SKIP_ANS'},
                              'VALS': {-9: 'MISSING', -4: 'NO_ASW', -3: 'SKIP_ANS',
                                       0: 'NO', 1: 'YES'}}

            # Spoke to parents about poor performance (English)
            self.sh_poorp = {'COL': 'SH_POORP', 'TYPE': 'CATEGORICAL', 'STR': 1582, 'END': 1583,
                             'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                        -3: 'SKIP_ANS'},
                             'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                      -3: 'SKIP_ANS', 0: 'NO', 1: 'YES'}}

            # Recognized for good grades
            self.good_grec = {'COL': 'RECOG', 'TYPE': 'CATEGORICAL', 'STR': 350, 'END': 351,
                              'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
                              'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                       0: 'NO', 1: 'YES'}}

            # How much likes school
            self.likes_s = {'COL': 'LIKES_S', 'TYPE': 'CATEGORICAL', 'STR': 428, 'END': 429,
                            'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP', -1: 'DONT_K'},
                            'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -1: 'DONT_K',
                                     1: 'NO', 2: 'SOME', 3: 'YES'}}

            # Use of school library for assignments
            self.library = {'COL': 'LIBRARY', 'TYPE': 'CATEGORICAL', 'STR': 708, 'END': 709,
                            'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                       -3: 'SKIP_ANS'},
                            'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -1: 'DONT_K',
                                     -3: 'SKIP_ANS', -7: 'NOT_INTERV',
                                     1: 'NEVER', 2: 'RARELY', 3: 'SOMET', 4: 'OFTEN'}}