diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1e53675 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/data/*.csv +/data/*.zip diff --git a/src/binance_open_data.ipynb b/src/binance_open_data.ipynb new file mode 100644 index 0000000..ed8b9e1 --- /dev/null +++ b/src/binance_open_data.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Binance Open Data lab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Binance Open Data](https://github.com/binance/binance-public-data/#klines) and analyze it.\n", + "\n", + "### Stet 1. Download data \n", + "\n", + "Downloading __1-minute candles__ for `BTC/USDT` and `BTC/UDSC` using `bash` or `powershell` scripts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "#!/bin/sh\n", + "\n", + "# create dir for data\n", + "!mkdir ../data\n", + "\n", + "# download data using GET request\n", + "!wget -N -P ../data https://data.binance.vision/data/spot/daily/klines/BTCUSDT/1m/BTCUSDT-1m-2022-06-21.zip\n", + "!wget -N -P../data https://data.binance.vision/data/spot/daily/klines/BTCUSDC/1m/BTCUSDC-1m-2022-06-21.zip\n", + "\n", + "# unzip\n", + "!unzip -o -d ../data ../data/BTCUSDT-1m-2022-06-21.zip \n", + "!unzip -o -d ../data ../data/BTCUSDC-1m-2022-06-21.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Import data to Dataframe \n", + "\n", + "Import packages for data analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import httpx\n", + "\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import data from CSV file to Pandas DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_data(pair: str) -> pd.DataFrame:\n", + " return pd.read_csv(f'../data/{pair}-1m-2022-06-21.csv', header = None)\n", + "\n", + "btcusdt_df = get_data('BTCUSDT')\n", + "btcusdt_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set names to columns:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def set_column_names(df: pd.DataFrame) -> pd.DataFrame:\n", + " column_names_mapping = {\n", + " 0: 'Open_time',\n", + " 1: 'Open',\n", + " 2: 'High',\n", + " 3: 'Low',\n", + " 4: 'Close',\n", + " 5: 'Volume',\n", + " 6: 'Close_time',\n", + " 7: 'Quote_asset_volume',\n", + " 8: 'Number_of_trades',\n", + " 9: 'Taker_buy_base_asset_volume',\n", + " 10: 'Taker_buy_quote_asset_volume',\n", + " 11: 'Ignore'\n", + " }\n", + " return df.rename(columns=column_names_mapping)\n", + "\n", + "btcusdt_df = set_column_names(btcusdt_df)\n", + "btcusdt_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert timestamp to human-readable date and time format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "btcusdt_df['Open_time'] = btcusdt_df.iloc[:, 0].apply(lambda t: datetime.fromtimestamp(t/1000))\n", + "btcusdt_df['Close_time'] = btcusdt_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000))\n", + "\n", + "btcusdt_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at _Descriptive statistics_ (min, mean, max, standard deviation):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "btcusdt_df.describe(datetime_is_numeric=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Transform data\n", + "\n", + "Calculate __1-hour OHLCV__ candles:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def calculate_ohclv(df: pd.DataFrame) -> pd.DataFrame:\n", + " df['hour'] = df['Close_time'].apply(lambda t: t.hour)\n", + "\n", + " return (\n", + " df\n", + " .groupby(['hour'])\n", + " .agg(\n", + " {\n", + " 'Open': 'first',\n", + " 'High': max,\n", + " 'Low': min,\n", + " 'Close': 'last',\n", + " 'Volume': sum,\n", + " 'Close_time': max\n", + " }\n", + " )\n", + " .reset_index()\n", + " .drop(columns=['hour'])\n", + " )\n", + "\n", + "btcusdt_1h_df = calculate_ohclv(btcusdt_df)\n", + "\n", + "btcusdt_1h_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data validation is very important. Let's write domain-driven asserts:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(\n", + " isinstance(btcusdt_1h_df, pd.DataFrame)\n", + " and btcusdt_1h_df.shape == (24, 6)\n", + " and not btcusdt_1h_df.isnull().any().any()\n", + " and btcusdt_1h_df.iloc[:, 0:5].ge(0).all().all()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Expand the dataset with information about `BTC/USDC` \n", + "\n", + "Download `BTC/USDC` 1-minute candles and transform it to 1-hour candles:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "btcusdc_df = get_data('BTCUSDC') # download data\n", + "btcusdc_df = set_column_names(btcusdc_df) # set column names\n", + "btcusdc_df['Close_time'] = btcusdc_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000)) # convert timestamp to date+time\n", + "\n", + "btcusdc_1h_df = calculate_ohclv(btcusdc_df) # calculate 1h OHCLV candles\n", + "btcusdc_1h_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Join altogether:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "btcusdt_1h_df['pair'] = 'BTC-USDT'\n", + "btcusdc_1h_df['pair'] = 'BTC-USDC'\n", + "\n", + "# Join datasets\n", + "candles_1h_df = pd.concat([btcusdt_1h_df, btcusdc_1h_df])\n", + "\n", + "# Validate result\n", + "assert(\n", + " isinstance(candles_1h_df, pd.DataFrame)\n", + " and candles_1h_df.shape == (48, 7)\n", + " and (candles_1h_df['pair'].unique() == ['BTC-USDT', 'BTC-USDC']).all()\n", + ")\n", + "\n", + "# Sort output by Close_time\n", + "candles_1h_df.sort_values('Close_time')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Use Binance API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# %%\n", + "import pandas as pd \n", + "from binance.client import Client\n", + "\n", + "\n", + "# %%\n", + "api_key = \"****\"\n", + "secret_key = \"***\"\n", + "\n", + "client = Client(api_key, secret_key)\n", + "\n", + "\n", + "# %%\n", + "coins_response = client.get_all_coins_info()\n", + "coins_df = pd.DataFrame.from_dict(coins_response, orient='columns')\n", + "\n", + "\n", + "# %%\n", + "pairs_list = coins_df.coin.apply(lambda x: f\"{x}USDT\") \n", + "client.get_historical_klines(\n", + " 'BTCUSDT', \n", + " interval=Client.KLINE_INTERVAL_1HOUR,\n", + " start_str='2022-04-21', \n", + " end_str='2022-04-22'\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.13 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "6fd7ff10be7e3a66c1b3745c4cbc00041a2589eb74ab4be46a3698a7b56001aa" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}