preprocess_data(df, variables_of_interest=variables_of_interest, weekday=True, unique_trips_only=False, process_durations=False)

Preprocess the NHTS data.

Parameters:

Name Type Description Default
df DataFrame

The input dataframe containing NHTS data.

required
variables_of_interest dict

A dictionary specifying the variables of interest and their types (categorical, numerical, time).

variables_of_interest
weekday bool

If True, processes data for weekdays; if False, processes data for weekends. Defaults to True.

True
unique_trips_only bool

If True, removes duplicate trips based on id, start_time, and end_time. Defaults to False.

False
process_durations bool

If True, processes the durations of activities and travels. Defaults to False.

False

Returns:

Type Description

pd.DataFrame: The preprocessed dataframe.

Notes
  • The function replaces specific values indicating no data with NaN and drops rows with NaN values.
  • It filters the dataframe to include only the specified variables of interest.
  • The function processes categorical, numerical, and time variables according to their specified types.
  • It renames columns to standardize names and drops certain columns not needed for further analysis.
  • If unique_trips_only is True, duplicate trips are removed.
  • If process_durations is True, the function calculates the travel and activity durations.

Examples:

>>> variables_of_interest = {
...     'Kön': {'type': 'categorical', 'categories': {1: 'Male', 2: 'Female'}},
...     'Åldersgrupp': {'type': 'categorical', 'categories': {1: '0-17', 2: '18-34', 3: '35-64', 4: '65+'}},
...     'Starttid': {'type': 'time'},
...     'Sluttid': {'type': 'time'},
...     'Reslängd': {'type': 'numerical'},
...     # additional variables...
... }
>>> df = preprocess_data(df, variables_of_interest, weekday=True, unique_trips_only=True, process_durations=True)
Source code in tripsender/nhts.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
def preprocess_data(df, variables_of_interest=variables_of_interest, weekday=True, unique_trips_only=False, process_durations=False):
    """
    Preprocess the NHTS data.

    The input frame is not modified: all work happens on a column-selected
    copy, which is returned.

    Args:
        df (pd.DataFrame): The input dataframe containing NHTS data.
        variables_of_interest (dict): A dictionary mapping each column to keep to a
            spec dict with a 'type' key ('categorical', 'numerical' or 'time');
            categorical specs also carry a 'categories' mapping from code to label.
            It must include 'VARDAG_HELG', 'Antal_resor_per_pers', 'LPNR',
            'Starttid' and 'Sluttid', because the filtering, sorting and
            de-duplication steps read those columns.
        weekday (bool): If True, keeps rows with VARDAG_HELG == 1 (weekdays); if False,
            keeps rows with VARDAG_HELG == 2 (weekends). Defaults to True.
        unique_trips_only (bool): If True, removes duplicate trips based on id, start_time, and end_time. Defaults to False.
        process_durations (bool): If True, processes the durations of activities and travels. Defaults to False.

    Returns:
        pd.DataFrame: The preprocessed dataframe.

    Notes:
        - Decimal commas in 'Reslängd' and 'VIKT_individ' are converted to decimal points
          (skipped when the column is already numeric).
        - The values -111, 999998, 99998, '' and ' ' are treated as missing; rows containing
          any missing value are dropped.
        - Only rows with `Antal_resor_per_pers` > 1 are kept.
        - Categorical and numerical variables are cast to float; categorical codes are then
          replaced by their labels. Time variables are parsed as HH:MM:SS and moved onto the
          date given by the module-level names `year`, `month` and `day`.
        - Columns are renamed to standardized English names; 'VARDAG_HELG' and
          'Antal_resor_per_pers' are dropped.
        - `activity_sequence` numbers the trips of each id from 0 in start-time order. It is
          assigned before de-duplication, so it can contain gaps when `unique_trips_only` is True.
        - If `process_durations` is True, `travel_duration_minutes` and
          `activity_duration_minutes` are added. Despite their names they hold the raw
          differences between the time columns (timedeltas), not numbers of minutes.

    Examples:
        >>> variables_of_interest = {
        ...     'Kön': {'type': 'categorical', 'categories': {1: 'Male', 2: 'Female'}},
        ...     'Åldersgrupp': {'type': 'categorical', 'categories': {1: '0-17', 2: '18-34', 3: '35-64', 4: '65+'}},
        ...     'Starttid': {'type': 'time'},
        ...     'Sluttid': {'type': 'time'},
        ...     'Reslängd': {'type': 'numerical'},
        ...     # additional variables...
        ... }
        >>> df = preprocess_data(df, variables_of_interest, weekday=True, unique_trips_only=True, process_durations=True)
    """
    # VARDAG_HELG coding: 1 = weekday, 2 = weekend.
    day_type = 1 if weekday else 2

    # Select the variables of interest first. This yields a new frame, so the
    # column rewrites below never touch the caller's dataframe.
    df = df[list(variables_of_interest.keys())].reset_index(drop=True)

    # The raw export uses a decimal comma; switch to a decimal point so the
    # later float cast succeeds. Columns that are already numeric have no
    # `.str` accessor and need no conversion.
    for column in ('Reslängd', 'VIKT_individ'):
        if column in df.columns and not pd.api.types.is_numeric_dtype(df[column]):
            df[column] = df[column].str.replace(',', '.')

    # Replace the no-data markers (-111, 999998, 99998, '', ' ') with NaN
    df = df.replace([-111, 999998, 99998, '', ' '], float('NaN'))
    # Drop rows with NaN
    df = df.dropna().reset_index(drop=True)

    # Keep only the requested day type
    df = df[df['VARDAG_HELG'] == day_type].reset_index(drop=True)
    # Keep only persons with more than one trip
    df = df[df['Antal_resor_per_pers'] > 1].reset_index(drop=True)
    #TODO - Remove entries for an LPNR with "Home" not the last trip
    #Arende == 8 not the last trip

    # Convert every variable according to its declared type
    for variable, spec in variables_of_interest.items():
        kind = spec['type']
        if kind == 'categorical':
            # Cast to float so the codes match the keys of the category mapping
            df[variable] = df[variable].astype(float)
            # Replace codes with their labels
            df[variable] = df[variable].replace(spec['categories'])
        elif kind == 'numerical':
            df[variable] = df[variable].astype(float)
        elif kind == 'time':
            # Parse as HH:MM:SS
            df[variable] = pd.to_datetime(df[variable], format='%H:%M:%S')
            # Move the parsed time onto the date given by the module-level
            # `year`, `month` and `day`
            df[variable] = df[variable].apply(lambda dt: dt.replace(year=year, month=month, day=day))

    # Rename columns to standardized English names
    df = df.rename(columns={
        'LPNR': 'id',
        'Kön': 'sex',
        'Åldersgrupp': 'age_group',
        'Bostadstyp' : 'house_type',
        'Hushållstyp' : 'household_type',
        'Antal_barn' : 'child_count',
        'Antal_bilar' : 'car_count',
        'Antal_vuxna' : 'adult_count',
        'Starttid' : 'start_time',
        'Huvud_fm' : 'mode',
        'Sluttid' : 'end_time',
        'Reslängd' : 'distance_km',
        'Arende' : 'purpose',
        'VIKT_individ' : 'weight_individual'
        })

    # The filter columns are no longer needed
    df = df.drop(['VARDAG_HELG', 'Antal_resor_per_pers'], axis=1)

    # Order each person's trips chronologically, then number them from 0
    df = df.sort_values(by=['id', 'start_time'])
    df['activity_sequence'] = df.groupby('id').cumcount()

    if unique_trips_only:
        # Drop duplicates based on id, start_time and end_time.
        # NOTE(review): activity_sequence was assigned above, so it may have
        # gaps after this step - confirm whether downstream code relies on that.
        df = df.drop_duplicates(subset=['id', 'start_time', 'end_time']).reset_index(drop=True)

    if process_durations:
        # Time spent travelling on each trip
        df['travel_duration_minutes'] = df['end_time'] - df['start_time']

        # Start time of the same person's next trip (missing for their last trip)
        df['next_travel_start_time'] = df.groupby('id')['start_time'].shift(-1)

        # Time between the end of this trip and the start of the next one
        df['activity_duration_minutes'] = df['next_travel_start_time'] - df['end_time']

        # A person's last trip has no next trip: carry the previous row's value
        # forward and treat the activity duration as zero.
        # NOTE(review): the forward fill is not grouped by id, so it can carry a
        # value across persons - confirm this is intended.
        df['next_travel_start_time'] = df['next_travel_start_time'].ffill()
        df['activity_duration_minutes'] = df['activity_duration_minutes'].fillna(pd.Timedelta(seconds=0))
    return df