Source code for preprocess

# Global modules
import collections
import math
import time

# Internal modules
from utils import *

[docs]class PreProcessor:
    """
    The preprocessor class that handles binarization and discretization of dataset
    """
    def __init__(self):
        self.transactions = []
        self.unique = collections.OrderedDict()
        self.trans_count = 0
        self.mapper = PreProcessor.Mapper()

[docs]    def get_transactions(self):
        """
        Getter method for transactions list of OrderedDict

        :return: Transactions list (list(OrderedDict))
        """
        return self.transactions

[docs]    def get_uniques(self):
        """
        Getter method for unique itemsets (dictionary)

        :return: Unique itemsets (dict), key: items, value: counts
        """
        return self.unique

[docs]    def get_transaction_count(self):
        """
        Getter method for transaction count after parsing the file

        :return: Transaction count (int)
        """
        return self.trans_count

[docs]    def parse_file(self, file):
        """ The main function to parse the file and run the preprocesser methods

        :param file: Filepath of the data
        :return: Returns number of the transaction parsed (int)
        """
        print("Preprocess begin to parse the file")
        start_t = time.clock()
        mp = self.mapper
        with open(file, "r") as f:
            for line in f:
                chars = str(line)
                # Get all the necessary fields here
                sex = self.get_field(chars, mp.sex)
                race = self.get_field(chars, mp.race)
                score = self.get_field(chars, mp.score)
                # lang_native=self.get_field(chars,mp.lang_native)
                fam_comp=self.get_field(chars,mp.fam_comp)
                par_edu=self.get_field(chars,mp.par_edu)
                income=self.get_field(chars,mp.income)
                s_expect=self.get_field(chars,mp.s_expect)
                control=self.get_field(chars,mp.control)
                sc_loc=self.get_field(chars,mp.sc_loc)
                # fight=self.get_field(chars,mp.fight)
                late = self.get_field(chars, mp.late)
                homework = self.get_field(chars, mp.homework)
                sh_accomp = self.get_field(chars, mp.sh_accomp)
                sh_poorp = self.get_field(chars, mp.sh_poorp)
                good_grec = self.get_field(chars, mp.good_grec)
                likes_s = self.get_field(chars, mp.likes_s)
                library = self.get_field(chars, mp.library)

                # Add this into transaction. Put all the fields into the list
                fields = [sex, race, score, fam_comp,
                          par_edu,income, s_expect, control, sc_loc,
                          late, homework, sh_accomp, sh_poorp,
                          good_grec, likes_s, library]

                self.add_transaction(fields)
        # Performance measurements
        total_t = str(format(time.clock() - start_t, '.4f'))
        print("Preprocessing took {:>10} seconds"
              .format(total_t))
        # Return number of transactions added
        return self.trans_count

[docs]    def get_field(self, line, mapper):
        """
        Selects the appropriate preprocessing method according to line and mapper structure

        :param line: New line of data (str)
        :param mapper: Corresponding mapper structure of the field
        :return: Preprocessed field (str)
        """
        value = line[mapper['STR'] - 1: mapper['END']]
        # Check the type and execute either discretize/binarize etc.
        if mapper['TYPE'] == 'BINARY':
            return self._is_name(mapper['COL'], mapper['VALS'][int(value)])
        elif mapper['TYPE'] == 'CATEGORICAL':
            return self.binarize(mapper, int(value))  # Change here later for OTHER field
        else:
            return self.discretize(mapper, float(value))

[docs]    def add_transaction(self, fields):
        """
        Adds the preprocessed fields into the transaction list

        :param fields: Preprocessed fields (list)
        :return:
        """
        self.trans_count += 1
        items = collections.OrderedDict()
        # Add the fields here, True is only used to have
        # OrderedSet kind of data structure
        items = collections.OrderedDict({f: True for f in fields})  # more pythonic way to populate
        self.count_unique(fields)  # Updates unique dict
        # Use keys to sort the dict
        items = collections.OrderedDict(sorted(items.items(), key=lambda _: _[0]))
        t = {'ID': self.trans_count, 'ITEMS': items}
        self.transactions.append(t)

[docs]    def count_unique(self, fields):
        """
        Updates the count of unique fields

        :param fields: List of fields
        :return:
        """
        for f in fields:
            if f not in self.unique:
                self.unique[f] = 1
            else:
                self.unique[f] += 1
        self.unique = collections.OrderedDict(sorted(self.unique.items(), key=lambda _: _[0]))

[docs]    def discretize(self, mapper, col_data):
        """
        Used to discretize the continous values from the given mapper and value

        :param mapper: Mapper of the continious field  (Mapper Class)
        :param col_data: Value of the continous field (float)
        :return: Returns discretized name of the field (string)
        """
        max = math.ceil(mapper['MAX'])
        min = math.floor(mapper['MIN'])
        interval = mapper['INTERVAL']
        step = (max - min) / interval
        # Initial check to decide in which range it belongs to
        lower = float(format(min, '.2f'))
        upper = float(format(lower + step, '.2f'))
        if col_data >= lower and col_data <= upper:
            str_interval = '[' + str(int(lower)) + '-' + str(int(upper)) + ']'
            # print('Lower : ' + str(lower) + ' Upper : ' + str(upper)
            #       + ' Value : ' + str(col_data) + ' Interval : ' + str_interval)
            return self._is_name(mapper['COL'], str_interval)

        # Check the boundries until the end of the interval value
        for i in range(1, interval):
            lower = float(format(upper, '.2f'))
            upper = float(format(upper + step, '.2f'))
            if col_data >= lower and col_data <= upper:
                str_interval = '[' + str(int(lower)) + '-' + str(int(upper)) + ']'
                # print('Lower : ' + str(lower) + ' Upper : ' + str(upper)
                #       + ' Value : ' + str(col_data) + ' Interval : ' + str_interval)
                return self._is_name(mapper['COL'], str_interval)

        raise ValueError('Value is not between the intervals check preprocessor::discretize')

[docs]    def binarize(self, mapper, col_data):
        """
        Binarize the attribute data using mapper

        :param col_data: Categorical data assumed to be between -9 and 25
        :param mapper: Corresponding mapper of this field
        :return: Binarized field (str)
        """
        if mapper is None:
            print("Give an appropriate mapper")
            return

        if col_data < -9:
            raise ValueError('Values cannot be less than -9 - check binarize method in preprocess.py')

        # Change this, in case if we break something
        max_categorical_value = 25
        if col_data > max_categorical_value:
            raise ValueError('Values cannot be more than 25 - check binarize method in preprocess.py')

        # Return proper COL_IS_ATTR name
        if col_data in mapper['VALS'].keys():
            if mapper['OTHERS'] is not None:
                if col_data in mapper['OTHERS']:
                    # Others
                    nm = self._is_name(col=mapper['COL'], attr="OTHERS")
                elif col_data in mapper['VALS']:
                    # Map as standalone field
                    nm = self._is_name(col=mapper['COL'], attr=mapper['VALS'][col_data])
                else:
                    raise Exception("Something unusual in preprocessor::binarize happened")
            else:
                # Map as standalone field
                nm = self._is_name(col=mapper['COL'], attr=mapper['VALS'][col_data])

            # Return the name
            return nm
        else:
            raise ValueError('This key is not inside our mapper VALS - check binarize method in preprocess.py')

[docs]    def save_transactions(self, path = "transactions.csv"):
        """
        Save the preprocessed transactions into a file

        :param path: Path to be saved
        :return: Returns true on successful save
        """
        print('Saving the transactions into {}'.format(path))
        start_t = time.clock()
        with open(path, 'w') as f:
            f.write("ID,ITEMS\n")
            for t in self.transactions:
                print_str = str(t['ID'])
                for i in t['ITEMS'].keys():
                    print_str += "," + i
                print_str += "\n"
                f.write(print_str)
        # Performance measurements
        total_t = str(format(time.clock() - start_t, '.4f'))
        print("Save procedure took {:>10} seconds"
              .format(total_t))
        return True

    def _print_transactions(self):
        """
        Used to print transactions in csv format

        :return:
        """
        print_str = "ID,ITEMS\n"
        for t in self.transactions:
            print_str += str(t['ID'])
            for i in t['ITEMS'].keys():
                print_str += "," + i
            print_str += "\n"
        print(print_str)

    def _is_name(self, col, attr):
        """
        Used to construct the itemset name with combination of column and attiribute

        :param col: Column name of the data (str)
        :param attr: Attribute name of the data (str)
        :return: Itemset name (str)
        """
        return col.upper() + "_IS_" + attr.upper()

    # Until getting nice representation using files(possibly JSON) use this structure
    # later we can create the file structure and parser for that.
[docs]    class Mapper:
        """
        Used to map the fields
        """
        def __init__(self):
            # Some fields can change
            self.sex = {'COL': 'SEX', 'TYPE': 'BINARY', 'STR': 24, 'END': 25,
                        'OTHERS': None,
                        'VALS': {1: 'MALE', 2: 'FEMALE'}}

            # Combine fields that are in others together
            self.race = {'COL': 'RACE', 'TYPE': 'CATEGORICAL', 'STR': 26, 'END': 27,
                         'OTHERS': {4: 'HISP_NR', 5: 'HISP_RC', 3: 'BLACK'},
                         'VALS': {1: 'AMER', 2: 'ASIA', 3: 'BLACK',
                                  4: 'HISP_NR', 5: 'HISP_RC', 6: 'MULT',
                                  7: 'WHITE'}}

            # SCORE_IS-20_60 , 35.12
            self.score = {'COL': 'SCORE', 'TYPE': 'CONTINIOUS', 'STR': 106, 'END': 111,
                          'MIN': 20.91, 'MAX': 81.04, 'INTERVAL': 5}

            # Whether English is student's native language-composite
            # self.lang_native = {'COL': 'ENG_LANG_NATIVE', 'TYPE': 'BINARY', 'STR': 28, 'END': 29,
            #             'OTHERS': None,
            #             'VALS': {0: 'NO', 1: 'YES'}}

            # Family composition
            self.fam_comp = {'COL': 'FAM', 'TYPE': 'CATEGORICAL', 'STR': 42, 'END': 43,
                         'OTHERS':{4: 'GG', 5: 'M', 6: 'F', 7: 'FEG', 8: 'MAG',9: 'HALFTIME'},
                         'VALS': {1: 'MF', 2: 'MG', 3: 'FG',
                                  4: 'GG', 5: 'M', 6: 'F',
                                  7: 'FEG', 8: 'MAG',9: 'HALFTIME'}}

            # Parents' highest level of education
            self.par_edu = {'COL': 'PAR_EDU', 'TYPE': 'CATEGORICAL', 'STR': 44, 'END': 45,
                                'OTHERS': None,
                                'VALS': {1: 'UHS', 2: 'HS', 3: 'US',
                                         4: 'S', 5: 'UC', 6: 'C',
                                         7: 'M', 8: 'PHD'}}

            # Total family income from all sources 2001-composite
            self.income = {'COL': 'INCOME', 'TYPE': 'CATEGORICAL', 'STR': 54, 'END': 55,
                            'OTHERS': {1: 'NONE', 2: '0-1K', 3: '1-5K'},
                            'VALS': {1: 'NONE', 2: '0-1K', 3: '1-5K',
                                     4: '5-10K', 5: '10-15K', 6: '15-20K',
                                     7: '20-25K', 8: '25-35K', 9: '35-50K',
                                     10: '50-75K', 11: '75-100K',12: '100-200K',
                                     13: '200K-more'}}

            # How far in school student thinks will get-composite
            self.s_expect = {'COL': 'S_EXPEC', 'TYPE': 'CATEGORICAL', 'STR': 72, 'END': 73,
                                'OTHERS': None,
                                'VALS': {-1:'UK',
                                         1: 'UHS', 2: 'HS', 3: 'S',
                                         4: 'UC', 5: 'C', 6: 'M',
                                         7: 'PHD'}}

            # School control
            self.control = {'COL': 'SC_CTRL', 'TYPE': 'CATEGORICAL', 'STR': 253, 'END': 253,
                             'OTHERS': None,
                             'VALS': {1: 'PUB', 2: 'CAT', 3: 'PRI'}}

            # School urbanicity
            self.sc_loc = {'COL': 'SC_LOC', 'TYPE': 'CATEGORICAL', 'STR': 254, 'END': 254,
                          'OTHERS': None,
                          'VALS': {1: 'UR', 2: 'SUB', 3: 'RU'}}

            # Got into a physical fight at school
            # self.fight = {'COL': 'FIGHT', 'TYPE': 'CATEGORICAL', 'STR': 336, 'END': 337,
            #                  'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
            #                  'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
            #                           1: 'NEVER', 2: '1-2', 3: '2-MORE'}}

            # Got into a physical fight at school
            self.late = {'COL': 'LATE', 'TYPE': 'CATEGORICAL', 'STR': 358, 'END': 359,
                          'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
                          'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                   1: 'NEVER', 2: '1-2', 3: '3-6',
                                   4: '7-9', 5: '10-more'}}

            # How often student completes homework
            self.homework = {'COL': 'HOMEWORK', 'TYPE': 'CATEGORICAL', 'STR': 1610, 'END': 1611,
                         'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                    -3: 'SKIP_ANS',-1 : 'DONT_K'},
                         'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                  -3: 'SKIP_ANS',-1 : 'DONT_K',
                                  1: 'NEVER', 2: 'RARELY', 3: 'SOMET',
                                  4: 'MOSTT', 5: 'ALLT'}}

            # Spoke to parents about accomplishments (English)
            self.sh_accomp = {'COL': 'SH_ACCOMP', 'TYPE': 'CATEGORICAL', 'STR': 1590, 'END': 1591,
                         'OTHERS': {-9: 'MISSING', -4: 'NO_ASW', -3: 'SKIP_ANS'},
                         'VALS': {-9: 'MISSING', -4: 'NO_ASW', -3: 'SKIP_ANS',
                                  0: 'NO', 1: 'YES'}}

            #Spoke to parents about poor performance (English)
            self.sh_poorp = {'COL': 'SH_POORP', 'TYPE': 'CATEGORICAL', 'STR': 1582, 'END': 1583,
                              'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                         -3: 'SKIP_ANS'},
                              'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -4: 'NO_ASW',
                                       -3: 'SKIP_ANS',
                                       0: 'NO', 1: 'YES'}}

            # Recognized for good grades
            self.good_grec = {'COL': 'RECOG', 'TYPE': 'CATEGORICAL', 'STR': 350, 'END': 351,
                         'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP'},
                         'VALS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                  0: 'NO', 1: 'YES'}}

            # How much likes school
            self.likes_s = {'COL': 'LIKES_S', 'TYPE': 'CATEGORICAL', 'STR': 428, 'END': 429,
                         'OTHERS': {-9: 'MISSING', -6: 'MUL_RESP',-1 : 'DONT_K'},
                         'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -1 : 'DONT_K',
                                  1: 'NO', 2: 'SOME', 3: 'YES'}}

            # Use of school library for assignments
            self.library = {'COL': 'LIBRARY', 'TYPE': 'CATEGORICAL', 'STR': 708, 'END': 709,
                            'OTHERS': {-9: 'MISSING', -7: 'NOT_INTERV', -6: 'MUL_RESP',
                                       -3: 'SKIP_ANS'},
                            'VALS': {-9: 'MISSING', -6: 'MUL_RESP', -1: 'DONT_K', -3: 'SKIP_ANS',
                                     -7: 'NOT_INTERV',
                                     1: 'NEVER', 2: 'RARELY', 3: 'SOMET', 4: 'OFTEN'}}