From 37ec6bef9388b3e6c60783cc964cc4de00a4fed4 Mon Sep 17 00:00:00 2001 From: codez0mb1e Date: Sat, 22 Oct 2022 09:25:59 +0000 Subject: [PATCH] Remove redundant --- src/azure.py | 66 --------------- src/bitfinex_crypto_parser.py | 150 ---------------------------------- 2 files changed, 216 deletions(-) delete mode 100644 src/azure.py delete mode 100644 src/bitfinex_crypto_parser.py diff --git a/src/azure.py b/src/azure.py deleted file mode 100644 index cdfcf77..0000000 --- a/src/azure.py +++ /dev/null @@ -1,66 +0,0 @@ - -# %% Import dependencies ---- -from dataclasses import dataclass -from typing import Dict, Any - -from sqlalchemy import create_engine, inspect - -import pandas as pd -import urllib - - -# %% Models -@dataclass(frozen=True) -class ConnectionSettings: - """Connection Settings""" - server: str - database: str - username: str - password: str - driver: str = '{ODBC Driver 18 for SQL Server}' - timeout: int = 30 - - -# %% Connection -class AzureDbConnection: - """Azure SQL database connection.""" - - def __init__(self, conn_settings: ConnectionSettings, echo: bool = False) -> None: - conn_params = urllib.parse.quote_plus( - 'Driver=%s;' % conn_settings.driver + - 'Server=tcp:%s.database.windows.net,1433;' % conn_settings.server + - 'Database=%s;' % conn_settings.database + - 'Uid=%s;' % conn_settings.username + - 'Pwd=%s;' % conn_settings.password + - 'Encrypt=yes;' + - 'TrustServerCertificate=no;' + - 'Connection Timeout=%s;' % conn_settings.timeout - ) - conn_string = f'mssql+pyodbc:///?odbc_connect={conn_params}' - - self._db = create_engine(conn_string, echo=echo) - - def connect(self) -> None: - """Estimate connection""" - self._conn = self._db.connect() - - def get_tables(self) -> list[str]: - """Get list of tables""" - inspector = inspect(self._db) - return [t for t in inspector.get_table_names()] - - def insert(self, inserted_data: pd.DataFrame, target_table: str, db_mapping: Dict[str, Any], chunksize: int = 10000) -> None: - inserted_data.to_sql( - con=self._db, - schema='dbo', - name=target_table, - if_exists='append', # or replace - index=False, - chunksize=chunksize, - dtype=db_mapping - ) - - def dispose(self) -> None: - """Dispose opened connections""" - self._conn.close() - self._db.dispose() diff --git a/src/bitfinex_crypto_parser.py b/src/bitfinex_crypto_parser.py deleted file mode 100644 index 86c9f61..0000000 --- a/src/bitfinex_crypto_parser.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/python3 - -""" -Data source: https://www.kaggle.com/datasets/tencars/392-crypto-currency-pairs-at-minute-resolution -""" - -# %% -import os - -import numpy as np -import pandas as pd -from sqlalchemy import types - -from azure import AzureDbConnection, ConnectionSettings - -# %% - -# In terminal: -#> kaggle -v # must be >1.15 -#> mkdir data; cd data -#> kaggle datasets download tencars/392-crypto-currency-pairs-at-minute-resolution -#> unzip 392-crypto-currency-pairs-at-minute-resolution.zip - -input_dir: str = "../data" - -# Get names and number of available currency pairs -pair_names = [x[:-4] for x in os.listdir(input_dir)] -usd_pairs = [s for s in pair_names if "usd" in s] - -# Print the first 50 currency pair names -print(f"These are the first 10 out of {len(usd_pairs)} currency pairs in the dataset:") -print(usd_pairs[0:10]) - - -# %% - -def load_data(symbol: str, input_dir: str) -> pd.DataFrame: - path_name = input_dir + "/" + symbol + ".csv" - - # Load data - df = pd.read_csv(path_name, index_col='time', dtype={'open': np.float64, 'high': np.float64, 'low': np.float64, 'close': np.float64, 'volume': np.float64}) - df.index = pd.to_datetime(df.index, unit='ms') - df = df[~df.index.duplicated(keep='first')] - - # As mentioned in the description, bins without any change are not recorded. - # We have to fill these gaps by filling them with the last value until a change occurs. - #df = df.resample('1T').pad() - - return df[['open', 'high', 'low', 'close', 'volume']] - - -def calc_ohlcv_1h(df: pd.DataFrame) -> pd.DataFrame: - df['hour'] = df.index.to_period('H') - - return ( - df - .groupby(['hour']) - .agg( - { - 'open': 'first', - 'high': max, - 'low': min, - 'close': 'last', - 'volume': sum, - #'time': max - } - ) - .reset_index() - ) - - -# %% ---- -ethusd_1m = load_data("ethusd", input_dir) -ethusd_1h = calc_ohlcv_1h(ethusd_1m) - -ethusd_1h.tail() - - - -# %% ---- -conn_settings = ConnectionSettings( - '', - '', - '', - '****' -) - -db_conn = AzureDbConnection(conn_settings) -db_conn.connect() - -for t in db_conn.get_tables(): - print(t) - - -# %% -db_mapping = { - 'FIGI': types.CHAR(length=12), - 'open': types.DECIMAL(precision=19, scale=9), - 'high': types.DECIMAL(precision=19, scale=9), - 'close': types.DECIMAL(precision=19, scale=9), - 'low': types.DECIMAL(precision=19, scale=9), - 'volume': types.DECIMAL(precision=19, scale=9), - 'time': types.DATETIME(), - 'source_id': types.SMALLINT(), - 'version': types.VARCHAR(length=12), - 'interval': types.CHAR(length=2) -} - - -# %% -pd.options.mode.chained_assignment = None - -min_candels_n = 10000 - -i = 1 -for pair in usd_pairs: - print(f'INFO | {pair} > Starting read dataset...') - - candles_df = load_data(pair, input_dir) - - if len(candles_df) > min_candels_n: - - df = candles_df.loc[:'2022-10-01'] - - if len(df) > 0: - df = calc_ohlcv_1h(df) - - df['FIGI'] = pair - df['time'] = df.hour.apply(lambda h: h.to_timestamp()) - df['source_id'] = 1 - df['version'] = 'v20221001' - df['interval'] = '1H' - df.drop(columns='hour', inplace=True) - - print(f'INFO | {pair} > Starting insert to DB ({i} of {len(usd_pairs)})...') - print('DEBUG | {} rows from {} to {}'.format(df.shape[0], min(df['time']), max(df['time']))) - try: - db_conn.insert(df, 'crypto_1h', db_mapping) - except Exception as ex: - print(f'ERROR | {pair} > {ex}') - - else: - print(f'WARN | {pair} > No new records') - else: - print(f'WARN | {pair} > Only {candles_df.shape[0]} records') - i += 1 - - -# %% -db_conn.dispose()