Index: trunk/SDToolBox/extract_data_EARTH.py
===================================================================
diff -u -r60 -r61
--- trunk/SDToolBox/extract_data_EARTH.py (.../extract_data_EARTH.py) (revision 60)
+++ trunk/SDToolBox/extract_data_EARTH.py (.../extract_data_EARTH.py) (revision 61)
@@ -68,171 +68,4 @@
 
         return output_data
 
-    def __get_filtered_dict(self):
-        """Filters the defined dictionary with only the
-        values provided by the user as input_variables.
-
-        Returns:
-            dict -- Dictionary of type str: str.
-        """
-        return {
-            k: v
-            for k, v in self._variable_dict.items()
-            if k in self._input_variables}
-
-    def __get_initial_extraction_data(self):
-        """Gets the basic elements for extracting ERA5 data.
-
-        Returns:
-            Tuple(dict, OutputData, dict) --
-                Tuple of values needed for extracting data.
-        """
-        filtered_dict = self.__get_filtered_dict()
-
-        output_data = OutputData(
-            self._input_variables
-        )
-        cases_dict = output_data.get_data_dict()
-
-        # longitude should be found as the 'x' in the first coordinate of
-        self.__input_lon = [
-            self.__check_for_longitude(lon)
-            for lon in self._input_lon]
-
-        return filtered_dict, output_data, cases_dict
-
-    def __get_case_subset_from_netcdf(
-            self,
-            case_file_path: str,
-            cases_dict: str,
-            nn_idx: Tuple[int, int],
-            variable_name: str,
-            n_variable: int):
-        """Gets all the values from a netcdf for the given variable
-        and delimited nearest neighbors.
-
-        Arguments:
-            case_file_path {str} -- Path to the netcdf file.
-            cases_dict {str} -- Output values.
-            nn_idx {Tuple[int, int]} -- Nearest Neighbors lon/lat.
-            variable_name {str} -- Name of the variable to extract.
-            n_variable {int} -- Index of the variable to search.
-
-        Returns:
-            Tuple[int, int] -- Nearest neigbors lon/lat.
-        """
-
-        # If file does not exist simply go to the next one
-        if not os.path.exists(case_file_path):
-            print(
-                'File {}'.format(case_file_path) +
-                'does not exist or could not be found.')
-            return
-
-        if not nn_idx:
-            nn_idx = self.__get_corrected_lon_lat(
-                case_file_path, cases_dict
-            )
-
-        # Lazy loading of the dataset.
-        with Dataset(case_file_path, 'r', self._ds_format) \
-                as case_dataset:
-            cases_dict[OutputData.var_val_key][variable_name] = \
-                self.__get_variable_subset(
-                    cases_dict[OutputData.var_val_key][variable_name],
-                    case_dataset,
-                    variable_name,
-                    nn_idx
-                )
-            # Get the time for the variable.
-            if n_variable == 0:
-                # add the lines to get the reference time
-                # automatically just in case
-                reftime = \
-                    case_dataset[OutputData.var_time_key].units.split(' ')
-                # This is an assumption that all the grids have
-                # the same scale in regards of time.
-                cases_dict[OutputData.var_time_key].extend(
-                    [datetime.strptime(
-                        reftime[2]+' '+reftime[3],
-                        '%Y-%m-%d %H:%M:%S')
-                        + timedelta(hours=int(ti))
-                        for ti in case_dataset[self._time_key][:]]
-                )
-        return nn_idx
-
-    def __get_variable_subset(
-            self,
-            variable_values: list,
-            netcdf_dataset: Dataset,
-            variable_name: str,
-            nn_idx):
-        """Gets the subset of vaues for the given variable.
-
-        Arguments:
-            variable_values {list} -- Stored values.
-            netcdf_dataset {Dataset} -- Input netCDF dataset.
-            variable_name {str} -- Name of the variable.
-            nn_idx {duple} -- Duple of lon or lat index.
-
-        Returns:
-            Array -- Array of values.
-        """
-        nn_lon_idx, nn_lat_idx = nn_idx
-        if variable_values is None:
-            return self.__get_case_subset(
-                netcdf_dataset,
-                variable_name,
-                nn_lon_idx,
-                nn_lat_idx)
-        return np.concatenate(
-            (variable_values,
-                self.__get_case_subset(
-                    netcdf_dataset,
-                    variable_name,
-                    nn_lon_idx,
-                    nn_lat_idx)),
-            axis=0)
-
-    def __get_corrected_lon_lat(
-            self, ref_file_path: str, cases_dict: dict):
-        """Gets the corrected index and value for the given input coordinates.
-
-        Arguments:
-            directory_path {str} -- Parent directory.
-            cases_dict {dict} -- Dictionary with all values that need format.
-
-        Returns:
-            [type] -- [description]
-        """
-        nn_lon_idx = []
-        nn_lat_idx = []
-
-        # Extract index and value for all input lat, lon.
-        with Dataset(ref_file_path, 'r', self._ds_format) \
-                as ref_dataset:
-            lat_list = ref_dataset.variables[self.__lat_key][:]
-            lon_list = ref_dataset.variables[self.__lon_key][:]
-            for lon_point in self._input_lon:
-                idx, value = d_a.get_nearest_neighbor(
-                    lon_point,
-                    lon_list)
-                cases_dict[OutputData.var_lon_key].append(value)
-                nn_lon_idx.append(idx)
-            for lat_point in self._input_lat:
-                idx, value = d_a.get_nearest_neighbor(
-                    lat_point,
-                    lat_list)
-                cases_dict[OutputData.var_lat_key].append(value)
-                nn_lat_idx.append(idx)
-        return nn_lon_idx, nn_lat_idx
-
-    @staticmethod
-    def __get_case_subset(dataset, variable_name, lon, lat):
-        return dataset[variable_name][:, lat, lon]
-
-    @staticmethod
-    def __check_for_longitude(longitude):
-        if longitude > 180:
-            return longitude-180
-        return longitude
+
\ No newline at end of file
Index: trunk/SDToolBox/input_data.py
===================================================================
diff -u -r59 -r61
--- trunk/SDToolBox/input_data.py (.../input_data.py) (revision 59)
+++ trunk/SDToolBox/input_data.py (.../input_data.py) (revision 61)
@@ -25,16 +25,13 @@
     input_coordinates = []
     input_variables = []
     input_years = []
+    input_scenarios = []
 
     # list of Lon Lat, for now they get extracted from the
     # input_coordinates.
     _input_lon = []
     _input_lat = []
 
-    # Earth scenarios?
-    input_EARTH_scenario = None
-    values_EARTH_scenarios = ['RCP45', 'RCP85', 'HIST']
-
     # These parameters are set in the extraction methods
     # Default values [-180, 180]
     min_longitude = -180
Index: trunk/SDToolBox/extract_data.py
===================================================================
diff -u -r59 -r61
--- trunk/SDToolBox/extract_data.py (.../extract_data.py) (revision 59)
+++ trunk/SDToolBox/extract_data.py (.../extract_data.py) (revision 61)
@@ -27,13 +27,7 @@
 
 
 class ExtractData:
 
-    _earth_lon_key = 'lon'
-    _earth_lat_key = 'lat'
-    _earth_var_dict = {
-        'var151': 'var151'
-    }
-
     @staticmethod
     def get_era_5(directory_path: str, input_data: InputData):
         """Extracts a collection of netCDF4 ERA5 subsets based on the
@@ -80,55 +74,50 @@
         """
 
         # Set ERA5 min / max longitudes.
-        input_data.min_longitude = -360
-        input_data.max_longitude = 360
+        input_data.min_longitude = -180
+        input_data.max_longitude = 180
 
         # Validate the input data.
         input_data.validate()
 
         # Define extractor.
         data_extractor = ExtractData.__EarthExtractor()
-        # return data_extractor.extract_subset(
-        #     directory_path=directory_path,
-        #     input_data=input_data,
-        #     values_dict=filtered_dict
-        # )
+        return data_extractor.extract_subset(
+            directory_path=directory_path,
+            input_data=input_data
+        )
 
     class BaseExtractor(ABC):
         # Region Abstract methods / properties.
 
         @property
         @abstractmethod
-        def lon_key(self):
+        def lon_key(self) -> str:
             raise Exception(om.error_needs_to_be_in_subclass)
 
         @property
         @abstractmethod
-        def lat_key(self):
+        def lat_key(self) -> str:
             raise Exception(om.error_needs_to_be_in_subclass)
 
         @property
         @abstractmethod
-        def variable_dict(self):
+        def file_iterator(self) -> List[dict]:
             raise Exception(om.error_needs_to_be_in_subclass)
 
         @abstractmethod
-        def get_netcdf_file_name(
-                self,
-                dir_path: str,
-                variable_key: str,
-                year: int):
-            raise Exception(om.error_needs_to_be_in_subclass)
-
-        @abstractmethod
         def get_case_time_values(
                 self,
                 ymd: str,
                 hmsmm: str,
-                case_values: Dataset):
+                case_values: Dataset) -> List[datetime]:
             raise Exception(om.error_needs_to_be_in_subclass)
 
+        @abstractmethod
+        def set_file_iterator(self, input_data: InputData, dir_path: str):
+            raise Exception(om.error_needs_to_be_in_subclass)
+
         # End region
 
         def extract_subset(
                 self,
@@ -145,99 +134,67 @@
             Returns:
                 OutputData -- Object with the extracted data.
             """
-            filtered_dict = self.__get_filtered_dict(
-                values_selected=input_data.input_variables
-            )
-            output_data = OutputData(input_data.input_variables)
-            nn_idx = None
-            # Iterate over all possible combinations of variable-year.
-            for n_variable, variable_name in enumerate(filtered_dict):
-                for year in input_data.input_years:
-                    case_file_path = self.get_netcdf_file_name(
-                        dir_path=directory_path,
-                        variable_key=filtered_dict.get(variable_name),
-                        year=year
-                    )
-                    # Process the subset for the file.
-                    print(
-                        'Extracting variable: {},'.format(year) +
-                        ' year {}.'.format(variable_name))
-                    nn_idx = self.__get_case_subset_from_netcdf(
-                        case_file_path=case_file_path,
-                        input_data=input_data,
-                        output_data=output_data,
-                        nn_idx=nn_idx,
-                        variable_name=variable_name,
-                        n_variable=n_variable
-                    )
-            return output_data
+            self.set_file_iterator(
+                input_data=input_data,
+                dir_path=directory_path)
 
-        def __get_case_subset_from_netcdf(
-                self,
-                case_file_path: str,
-                input_data: InputData,
-                output_data: OutputData,
-                nn_idx: Tuple[int, int],
-                variable_name: str,
-                n_variable: int):
-            """Gets all the values from a netcdf for the given variable
-            and delimited nearest neighbors.
-
-            Arguments:
-                case_file_path {str} -- Path to the netcdf file.
-                input_data {InputData} -- Data structure with input params.
-                output_data {OutputData} -- Output values.
-                nn_idx {Tuple[int, int]} -- Nearest Neighbors lon/lat.
-                variable_name {str} -- Name of the variable to extract.
-                n_variable {int} -- Index of the variable to search.
-
-            Returns:
-                Tuple[int, int] -- Nearest neigbors lon/lat.
-            """
-
-            # If file does not exist simply go to the next one
-            if not os.path.exists(case_file_path):
-                print(
-                    'File {}'.format(case_file_path) +
-                    'does not exist or could not be found.')
-                return
-
-            if not nn_idx:
-                nn_idx = self.__get_nearest_neighbors_lon_lat(
-                    ref_file_path=case_file_path,
+            output_data = OutputData(input_data.input_variables)
+            if not self.file_iterator:
+                # No input file was found, so there is nothing to extract.
+                return output_data
+            # Set the nearest neighbors and the time reference.
+            nn_idx = self.__get_nearest_neighbors_lon_lat(
+                    ref_file_path=self.__first_filepath,
                     input_data=input_data,
                     cases_dict=output_data.data_dict
                 )
 
-            # Lazy loading of the dataset.
-            with Dataset(case_file_path, 'r', output_data._ds_format) \
-                    as input_dataset:
-                output_data.set_in_var_dict(
-                    var_name=variable_name,
-                    value=self.__get_variable_subset(
-                        output_data.get_from_var_dict(variable_name),
-                        input_dataset,
-                        variable_name,
-                        nn_idx
+            # Iterate over all possible combinations of variable-year.
+            for file_entry in self.file_iterator:
+                variable_name = file_entry.get('variable')
+                filepath = file_entry.get('filepath')
+                variable_key = file_entry.get('key')
+                # Process the subset for the file.
+                print(
+                    'Extracting variable: {},'.format(variable_name) +
+                    ' year {}.'.format(file_entry['year']))
+                # Lazy loading of the dataset.
+                with Dataset(filepath, 'r', output_data._ds_format) \
+                        as input_dataset:
+                    output_data.set_in_var_dict(
+                        var_name=variable_key,
+                        value=self.__get_variable_subset(
+                            output_data.get_from_var_dict(variable_key),
+                            input_dataset,
+                            variable_key,
+                            nn_idx
+                        )
                     )
-                )
+                    # Set the time if needed.
+                    if variable_key == self.__first_variable_key:
+                        # add the lines to get the reference time
+                        reftime = \
+                            input_dataset[
+                                OutputData.var_time_key].units.split(' ')
+                        # This is an assumption that all the grids have
+                        # the same scale in regards of time.
+                        output_data.data_dict[OutputData.var_time_key].extend(
+                            self.get_case_time_values(
+                                ymd=reftime[2],
+                                hmsmm=reftime[3],
+                                case_values=input_dataset)
+                        )
 
-                # Get the time for the variable.
-                if n_variable == 0:
-                    # add the lines to get the reference time
-                    # automatically just in case
-                    reftime = \
-                        input_dataset[OutputData.var_time_key].units.split(' ')
-                    # This is an assumption that all the grids have
-                    # the same scale in regards of time.
-                    output_data.data_dict[OutputData.var_time_key].extend(
-                        self.get_case_time_values(
-                            ymd=reftime[2],
-                            hmsmm=reftime[3],
-                            case_values=input_dataset)
-                    )
-            return nn_idx
+            return output_data
 
+        @property
+        def __first_filepath(self) -> str:
+            return self.file_iterator[0]['filepath']
+
+        @property
+        def __first_variable_key(self) -> str:
+            return self.file_iterator[0]['key']
+
         def __get_variable_subset(
                 self,
                 variable_values: list,
@@ -303,19 +260,22 @@
                 )
             return nn_lon_idx, nn_lat_idx
 
-        def __get_filtered_dict(self, values_selected: List[str]):
+        def _get_filtered_dict(
+                self,
+                values_selected: List[str],
+                values_dict: Dict[str, str]) -> Dict[str, str]:
             """Returns a dictionary with only the requested
             variables from the user.
 
             Arguments:
                 values_selected {List[str]} -- Selected variables.
-
+                values_dict {Dict[str, str]} -- Dictionary to filter.
             Returns:
                 Dict[str, str] -- Filtered dictionary.
             """
             return {
                 k: v
-                for k, v in self.variable_dict.items()
+                for k, v in values_dict.items()
                 if k in values_selected}
 
         def __set_nn(
@@ -374,6 +334,8 @@
             'v10': 'wind_v'
         }
 
+        __file_iterator = []
+
         @property
         def lon_key(self):
             return self.__era5_lon_key
@@ -383,28 +345,28 @@
             return self.__era5_lat_key
 
         @property
-        def variable_dict(self):
-            return self.__era5_var_dict
+        def file_iterator(self) -> List[dict]:
+            return self.__file_iterator
 
-        def get_netcdf_file_name(
+        def __get_netcdf_filepath(
                 self,
                 dir_path: str,
-                variable_key: str,
+                variable: str,
                 year: int):
             """Gets the era5 filepath.
 
             Arguments:
                 dir_path {str} -- Parent directory.
-                variable_key {str} -- Variable in file name.
+                variable {str} -- Variable in file name.
                 year {int} -- Year in file name.
 
             Returns:
                 str -- File path location based on ERA5 format.
             """
             # Find the matching file
             base_file_name = '' + \
-                'era5_Global_{}_{}.nc'.format(variable_key, year)
-            case_dir = os.path.join(dir_path, variable_key)
+                'era5_Global_{}_{}.nc'.format(variable, year)
+            case_dir = os.path.join(dir_path, variable)
             file_path = os.path.join(case_dir, base_file_name)
             return file_path
 
@@ -430,5 +392,118 @@
                 + timedelta(hours=int(ti))
                 for ti in case_values['time'][:]]
 
+        def set_file_iterator(self, input_data: InputData, dir_path: str):
+            """Collects one entry per existing variable/year ERA5 file."""
+            filtered_dict = self._get_filtered_dict(
+                input_data.input_variables,
+                self.__era5_var_dict)
+            # Rebind on the instance so the class-level list is not
+            # shared between extractors and does not grow across calls.
+            self.__file_iterator = []
+
+            for key_name, variable_name in filtered_dict.items():
+                for year in input_data.input_years:
+                    file_path = self.__get_netcdf_filepath(
+                        dir_path=dir_path,
+                        variable=variable_name,
+                        year=year
+                    )
+                    # If file does not exist simply go to the next one
+                    if not os.path.exists(file_path):
+                        print(
+                            'File {} '.format(file_path) +
+                            'does not exist or could not be found.')
+                        continue
+                    self.__file_iterator.append(
+                        {
+                            'year': year,
+                            'filepath': file_path,
+                            'variable': variable_name,
+                            'key': key_name
+                        })
+
     class __EarthExtractor(BaseExtractor):
-        pass
+        __earth_lon_key = 'lon'
+        __earth_lat_key = 'lat'
+
+        __earth_var_dict = {
+            'var151': 'var151'
+        }
+
+        possible_scenarios = ['RCP45', 'RCP85', 'HIST']
+
+        __file_iterator = []
+
+        @property
+        def lon_key(self):
+            return self.__earth_lon_key
+
+        @property
+        def lat_key(self):
+            return self.__earth_lat_key
+
+        @property
+        def file_iterator(self) -> List[dict]:
+            return self.__file_iterator
+
+        def get_case_time_values(
+                self,
+                ymd: str,
+                hmsmm: str,
+                case_values: Dataset):
+            """Returns a list of formatted datetime values from
+            a given dataset.
+
+            Arguments:
+                ymd {str} -- Datetime string: Year Month Day
+                hmsmm {str} -- Datetime string: Hour Min. Sec. Ms.
+                case_values {Dataset} -- Input dataset.
+
+            Returns:
+                List[datetime] -- List of formatted datetimes.
+            """
+            return [datetime.strptime(
+                '{} {}'.format(ymd, hmsmm),
+                '%Y-%m-%d %H:%M:%S')
+                + timedelta(hours=int(ti))
+                for ti in case_values['time'][:]]
+
+        def __get_files_for_scenario(self) -> List[Dict[str, str]]:
+            pass
+
+        def set_file_iterator(self, input_data: InputData, dir_path: str):
+            """Collects existing variable/year files per valid scenario."""
+            filtered_vars = self._get_filtered_dict(
+                input_data.input_variables,
+                self.__earth_var_dict)
+            # Rebind on the instance so the class-level list is not
+            # shared between extractors and does not grow across calls.
+            self.__file_iterator = []
+
+            for scenario in input_data.input_scenarios:
+                if scenario.upper() not in self.possible_scenarios:
+                    continue
+                files_for_scenario = self.__get_files_for_scenario()
+                for key_name, variable_name in filtered_vars.items():
+                    for year in input_data.input_years:
+                        # TODO(review): no __get_netcdf_filepath is defined
+                        # on this class, so this name-mangled call raises
+                        # AttributeError until an EARTH path builder exists.
+                        file_path = self.__get_netcdf_filepath(
+                            dir_path=dir_path,
+                            variable=variable_name,
+                            year=year
+                        )
+                        # If file does not exist simply go to the next one
+                        if not os.path.exists(file_path):
+                            print(
+                                'File {} '.format(file_path) +
+                                'does not exist or could not be found.')
+                            continue
+                        self.__file_iterator.append(
+                            {
+                                'year': year,
+                                'filepath': file_path,
+                                'variable': variable_name,
+                                'key': key_name
+                            })