Index: trunk/SDToolBox/output_messages.py =================================================================== diff -u -r60 -r62 --- trunk/SDToolBox/output_messages.py (.../output_messages.py) (revision 60) +++ trunk/SDToolBox/output_messages.py (.../output_messages.py) (revision 62) @@ -23,3 +23,4 @@ error_needs_to_be_in_subclass = 'Needs to be implemented in subclasses.' error_no_gradient_was_calculated = 'No gradient was calculated, ' + \ ' please calculate the gradient on the xarray before proceeding' +error_function_not_implemented = 'Functionality not implemented.' Fisheye: Tag 62 refers to a dead (removed) revision in file `trunk/SDToolBox/extract_data_EARTH.py'. Fisheye: No comparison available. Pass `N' to diff? Index: trunk/SDToolBox/extract_data.py =================================================================== diff -u -r61 -r62 --- trunk/SDToolBox/extract_data.py (.../extract_data.py) (revision 61) +++ trunk/SDToolBox/extract_data.py (.../extract_data.py) (revision 62) @@ -7,8 +7,9 @@ import sys import os -from typing import List, Set, Dict, Tuple, Optional +from typing import List, Set, Dict, Tuple, Optional, Any from abc import ABC, abstractmethod +import itertools from SDToolBox import output_messages as om from SDToolBox.input_data import InputData @@ -58,7 +59,7 @@ @staticmethod def get_era5_GTSM(directory_path: str, input_data: InputData): - pass + raise Exception(om.error_function_not_implemented) @staticmethod def get_earth(directory_path: str, input_data: InputData): @@ -89,7 +90,16 @@ ) class BaseExtractor(ABC): + file_var_key = 'variable' + file_key_key = 'key' + file_fpath_key = 'filepath' + file_year_key = 'year' + file_month_key = 'month' + file_scenario_key = 'scenario' + netcdf_format = 'netCDF4' + __file_iterator = [] + # Region Abstract methods / properties. @property @abstractmethod @@ -103,9 +113,17 @@ @property @abstractmethod - def file_iterator(self) -> Dict[str, str]: + def var_dict(self) -> Dict[str, str]: raise Exception(om.error_needs_to_be_in_subclass) + @property + def __first_filepath(self) -> str: + return self.__file_iterator[0][self.file_fpath_key] + + @property + def __first_variable_key(self) -> str: + return self.__file_iterator[0][self.file_key_key] + @abstractmethod def get_case_time_values( self, @@ -115,16 +133,59 @@ raise Exception(om.error_needs_to_be_in_subclass) @abstractmethod - def set_file_iterator(self, input_data: InputData, dir_path: str): + def get_file_combinations(**args) -> List[List[Any]]: raise Exception(om.error_needs_to_be_in_subclass) + @abstractmethod + def get_filepath( + self, + dir_path: str, + file_entry: List[List[Any]]) -> str: + raise Exception(om.error_needs_to_be_in_subclass) + + @abstractmethod + def get_new_file_iter(self, file_entry, file_path) -> Dict[str, str]: + raise Exception(om.error_needs_to_be_in_subclass) + + def set_file_iterator( + self, + input_data: InputData, + dir_path: str) -> List[Dict[str, str]]: + """Sets the file iterator based on the possible file + combinations. + + Arguments: + file_combinations {List[List[Any]]} + -- File combinations for the dataset type. + dir_path {str} -- Path to the parent directory. + + Returns: + List[Dict[str, str]] -- [description] + """ + # Get combinations + file_combinations = self.get_file_combinations(input_data) + # Iterate over f_combs + for file_entry in file_combinations: + file_path = self.get_filepath( + dir_path=dir_path, + file_entry=file_entry) + if not os.path.exists(file_path): + print('File not found {}'.format(file_path)) + continue + self.__file_iterator.append( + self.get_new_file_iter( + file_entry=file_entry, + file_path=file_path + )) + # End region + def extract_subset( self, directory_path: str, input_data: InputData): - """Extracts an ERA5 subset given a directory path and - the input data. + """Extracts an netCDF subset given a directory path and + with boundaries given through the input_data. Arguments: directory_path {str} @@ -139,24 +200,28 @@ dir_path=directory_path) output_data = OutputData(input_data.input_variables) + if len(self.__file_iterator) == 0: + return output_data + # Set the nearest neighbors and the time reference. nn_idx = self.__get_nearest_neighbors_lon_lat( ref_file_path=self.__first_filepath, input_data=input_data, cases_dict=output_data.data_dict ) + time_entry_refs = self.__get_time_ref_group() # Iterate over all possible combinations of variable-year. - for file_entry in self.file_iterator: - variable_name = file_entry.get('variable') - filepath = file_entry.get('filepath') - variable_key = file_entry.get('key') + for file_entry in self.__file_iterator: + variable_name = file_entry.get(self.file_var_key) + filepath = file_entry.get(self.file_fpath_key) + variable_key = file_entry.get(self.file_key_key) # Process the subset for the file. print( 'Extracting variable: {},'.format(variable_name) + - ' year {}.'.format(file_entry['year'])) + ' year {}.'.format(file_entry.get(self.file_year_key))) # Lazy loading of the dataset. - with Dataset(filepath, 'r', output_data._ds_format) \ + with Dataset(filepath, 'r', self.netcdf_format) \ as input_dataset: output_data.set_in_var_dict( var_name=variable_key, @@ -167,12 +232,11 @@ nn_idx ) ) - # Set the time if needed. - if variable_key == self.__first_variable_key: + # Set the time if the file needs to be considered. + if file_entry in time_entry_refs: # add the lines to get the reference time - reftime = \ - input_dataset[ - OutputData.var_time_key].units.split(' ') + reftime = input_dataset[ + OutputData.var_time_key].units.split(' ') # This is an assumption that all the grids have # the same scale in regards of time. output_data.data_dict[OutputData.var_time_key].extend( @@ -184,14 +248,13 @@ return output_data - @property - def __first_filepath(self) -> str: - return self.file_iterator[0]['filepath'] + def __get_time_ref_group(self) -> List[str]: + return [ + file_entry + for file_entry in self.__file_iterator + if file_entry[self.file_year_key] == + self.__file_iterator[0][self.file_year_key]] - @property - def __first_variable_key(self) -> str: - return self.file_iterator[0]['key'] - def __get_variable_subset( self, variable_values: list, @@ -229,7 +292,7 @@ self, ref_file_path: str, input_data: InputData, - cases_dict: dict): + cases_dict: dict) -> Tuple[List[int], List[int]]: """Gets the corrected index and value for the given input coordinates. @@ -240,10 +303,11 @@ -- Dictionary with all values that need format. Returns: - Tuple[int, int] -- Indices of nearest neighbors. + Tuple[List[int], List[int]] + -- Indices of nearest neighbors. """ # Extract index and value for all input lat, lon. - with Dataset(ref_file_path, 'r', OutputData._ds_format) \ + with Dataset(ref_file_path, 'r', self.netcdf_format) \ as ref_dataset: nn_lat_idx = self.__set_nn( input_values=input_data._input_lat, @@ -279,7 +343,7 @@ self, input_values: List[float], reference_list: List[float], - output_values: List[float]): + output_values: List[float]) -> List[int]: """Sets the nearest neighbor for all the elements given in the points_list. @@ -301,7 +365,7 @@ return output_idx @staticmethod - def get_nearest_neighbor(value, data_array): + def get_nearest_neighbor(value, data_array) -> Tuple[int, int]: """ search for nearest decimal degree in an array of decimal degrees and return the index. @@ -331,8 +395,6 @@ 'v10': 'wind_v' } - __file_iterator = [] - @property def lon_key(self): return self.__era5_lon_key @@ -342,36 +404,35 @@ return self.__era5_lat_key @property - def file_iterator(self) -> Dict[str, str]: - return self.__file_iterator + def var_dict(self) -> Dict[str, str]: + return self.__era5_var_dict - def __get_netcdf_filepath( + def get_filepath( self, dir_path: str, - variable: str, - year: int): - """Gets the era5 filepath. + file_entry: Dict[str, str]) -> str: + """Gets the earth filepath. Arguments: dir_path {str} -- Parent directory. - variable {str} -- Variable in file name. - year {int} -- Year in file name. + file_entry {Dict[str]} -- Dict of file attributes. Returns: - str -- File path location based on ERA5 format. + str -- File path location based on EARTH format. """ # Find the matching file base_file_name = '' + \ - 'era5_Global_{}_{}.nc'.format(variable, year) - case_dir = os.path.join(dir_path, variable) - file_path = os.path.join(case_dir, base_file_name) - return file_path + 'era5_Global_{}_{}.nc'.format( + file_entry[0][1], + file_entry[1]) + case_dir = os.path.join(dir_path, file_entry[0][1]) + return os.path.join(case_dir, base_file_name) def get_case_time_values( self, ymd: str, hmsmm: str, - case_values: Dataset): + case_values: Dataset) -> List[datetime]: """Returns a list of formatted datetime values from a given dataset. @@ -389,31 +450,21 @@ + timedelta(hours=int(ti)) for ti in case_values['time'][:]] - def set_file_iterator(self, input_data: InputData, dir_path: str): + def get_file_combinations(self, input_data: InputData): filtered_dict = self._get_filtered_dict( input_data.input_variables, self.__era5_var_dict) + return itertools.product( + filtered_dict.items(), + input_data.input_years) - for key_name, variable_name in filtered_dict.items(): - for year in input_data.input_years: - file_path = self.__get_netcdf_filepath( - dir_path=dir_path, - variable=variable_name, - year=year - ) - # If file does not exist simply go to the next one - if not os.path.exists(file_path): - print( - 'File {}'.format(file_path) + - 'does not exist or could not be found.') - continue - self.__file_iterator.append( - { - 'year': year, - 'filepath': file_path, - 'variable': variable_name, - 'key': key_name - }) + def get_new_file_iter(self, file_entry, file_path) -> Dict[str, str]: + return { + self.file_var_key: file_entry[0][1], + self.file_key_key: file_entry[0][0], + self.file_year_key: file_entry[1], + self.file_fpath_key: file_path + } class __EarthExtractor(BaseExtractor): __earth_lon_key = 'lon' @@ -425,8 +476,6 @@ possible_scenarios = ['RCP45', 'RCP85', 'HIST'] - __file_iterator = {} - @property def lon_key(self): return self.__earth_lon_key @@ -436,8 +485,8 @@ return self.__earth_lat_key @property - def file_iterator(self) -> Dict[str, str]: - return self.__file_iterator + def var_dict(self) -> Dict[str, str]: + return self.__earth_var_dict def get_case_time_values( self, @@ -461,35 +510,57 @@ + timedelta(hours=int(ti)) for ti in case_values['time'][:]] - def __get_files_for_scenario(self) -> List[Dict[str, str]]: - pass + def get_filepath( + self, + dir_path: str, + file_entry: Dict[str, str]) -> str: + """Gets the Earth filepath. - def set_file_iterator(self, input_data: InputData, dir_path: str): - filtered_vars = self._get_filtered_dict( + Arguments: + dir_path {str} -- Parent directory. + file_entry {Dict[str]} -- Dict of file attributes. + + Returns: + str -- File path location based on ERA5 format. + """ + # Find the matching file + base_file_name = '' + \ + 'EC-EARTH_{}_{}_{}{}.nc'.format( + file_entry[3], + file_entry[0][1], + file_entry[1], file_entry[2]) + case_dir = os.path.join(dir_path, file_entry[3]) + return os.path.join(case_dir, base_file_name) + + def get_file_combinations(self, input_data: InputData): + """Builds a specific file iterator to be used in the parent + class. + + Arguments: + input_data {InputData} -- Input elements. + """ + filtered_dict = self._get_filtered_dict( input_data.input_variables, self.__earth_var_dict) - for scenario in input_data.input_scenarios: - if scenario.upper() not in self.possible_scenarios: - continue - files_for_scenario = self.__get_files_for_scenario() - for key_name, variable_name in filtered_vars.items(): - for year in input_data.input_years: - file_path = self.__get_netcdf_filepath( - dir_path=dir_path, - variable=variable_name, - year=year - ) - # If file does not exist simply go to the next one - if not os.path.exists(file_path): - print( - 'File {}'.format(file_path) + - 'does not exist or could not be found.') - continue - self.__file_iterator.append( - { - 'year': year, - 'filepath': file_path, - 'variable': variable_name, - 'key': key_name - }) + filtered_scenarios = [ + scenario + for scenario in input_data.input_scenarios + if scenario.upper() in self.possible_scenarios + ] + + return itertools.product( + filtered_dict.items(), + input_data.input_years, + [str(mm).zfill(2) for mm in range(1, 13)], + filtered_scenarios) + + def get_new_file_iter(self, file_entry, file_path) -> Dict[str, str]: + return { + self.file_var_key: file_entry[0][1], + self.file_key_key: file_entry[0][0], + self.file_year_key: file_entry[1], + self.file_month_key: file_entry[2], + self.file_scenario_key: file_entry[3], + self.file_fpath_key: file_path + } Fisheye: Tag 62 refers to a dead (removed) revision in file `trunk/tests/test_extract_data_EARTH.py'. Fisheye: No comparison available. Pass `N' to diff? Index: trunk/tests/test_extract_data.py =================================================================== diff -u -r59 -r62 --- trunk/tests/test_extract_data.py (.../test_extract_data.py) (revision 59) +++ trunk/tests/test_extract_data.py (.../test_extract_data.py) (revision 62) @@ -81,6 +81,37 @@ class Test_get_earth: - @pytest.mark.unittest - def test_dummy(self): - ExtractData.BaseExtractor() \ No newline at end of file + @pytest.mark.systemtest + @pytest.mark.parametrize( + "input_variables, input_scenarios", + [(['var151'], ['RCP45'])], + ids=['RCP45 simple']) + def test_when_extract_single_point_from_earth_dir_then_returns_output( + self, input_variables: List[str], input_scenarios: List[str]): + # 1. Given + # When using local data you can just replace the comment in these lines + dir_test_data = TestUtils.get_local_test_data_dir('earth_test_data') + # dir_test_data = 'P:\\metocean-data\\open\\ERA5\\data\\Global' + + input_data = InputData() + input_data.input_variables = input_variables + input_data.input_coordinates = [(4.2, 2.4), ] + input_data.input_years = [1981, 1982] + input_data.input_scenarios = input_scenarios + + # 2. When + try: + output_data = ExtractData.get_earth( + dir_test_data, + input_data) + except Exception as e_info: + pytest.fail('Exception {} thrown.'.format(str(e_info))) + + # 3. Then + assert output_data is not None + data_dict = output_data.data_dict + assert data_dict is not None + assert data_dict[output_data.var_time_key] is not None + assert data_dict[output_data.var_lon_key] is not None + assert data_dict[output_data.var_lat_key] is not None + assert data_dict[output_data.var_val_key] is not None