Biance data crawlers (beta)

2024-11-22 02:12:19 +00:00 · 2022-06-26 13:18:10 +00:00 · 2022-06-26 13:18:10 +00:00 · 5451898a9d
commit 5451898a9d
parent c47f8bf8c3
3 changed files with 328 additions and 2176 deletions
--- a/src/binance_api_client.py
+++ b/src/binance_api_client.py
@ -0,0 +1,29 @@
 # %%
 import numpy as np
 import pandas as pd 
 import time
 from binance.client import Client
 # %%
 api_key = "****"
 secret_key = "***"
 client = Client(api_key, secret_key)
 # %%
 coins_response = client.get_all_coins_info()
 coins_df = pd.DataFrame.from_dict(coins_response, orient='columns')
 # %%
 pairs_list = coins_df.coin.apply(lambda x: f"{x}USDT") 
 client.get_historical_klines(
    'BTCUSDT', 
    interval=Client.KLINE_INTERVAL_1HOUR,
    start_str='2022-04-21', 
    end_str='2022-04-22'
 )
--- a/src/binance_open_data.ipynb
+++ b/src/binance_open_data.ipynb
@ -0,0 +1,299 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Binance Open Data lab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[Binance Open Data](https://github.com/binance/binance-public-data/#klines) and analyze it.\n",
    "\n",
    "### Stet 1. Download data \n",
    "\n",
    "Downloading __1-minute candles__ for `BTC/USDT` and `BTC/UDSC` using `bash` or `powershell` scripts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "shellscript"
    }
   },
   "outputs": [],
   "source": [
    "#!/bin/sh\n",
    "\n",
    "# create dir for data\n",
    "!mkdir ../data\n",
    "\n",
    "# download data using GET request\n",
    "!wget -N -P ../data https://data.binance.vision/data/spot/daily/klines/BTCUSDT/1m/BTCUSDT-1m-2022-06-21.zip\n",
    "!wget -N -P../data https://data.binance.vision/data/spot/daily/klines/BTCUSDC/1m/BTCUSDC-1m-2022-06-21.zip\n",
    "\n",
    "# unzip\n",
    "!unzip -o -d ../data ../data/BTCUSDT-1m-2022-06-21.zip \n",
    "!unzip -o -d ../data ../data/BTCUSDC-1m-2022-06-21.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2: Import data to Dataframe \n",
    "\n",
    "Import packages for data analysis:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import httpx\n",
    "\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import data from CSV file to Pandas DataFrame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data(pair: str) -> pd.DataFrame:\n",
    "    return pd.read_csv(f'../data/{pair}-1m-2022-06-21.csv', header = None)\n",
    "\n",
    "btcusdt_df = get_data('BTCUSDT')\n",
    "btcusdt_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set names to columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def set_column_names(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    column_names_mapping = {\n",
    "        0: 'Open_time',\n",
    "        1: 'Open',\n",
    "        2: 'High',\n",
    "        3: 'Low',\n",
    "        4: 'Close',\n",
    "        5: 'Volume',\n",
    "        6: 'Close_time',\n",
    "        7: 'Quote_asset_volume',\n",
    "        8: 'Number_of_trades',\n",
    "        9: 'Taker_buy_base_asset_volume',\n",
    "        10: 'Taker_buy_quote_asset_volume',\n",
    "        11: 'Ignore'\n",
    "        }\n",
    "    return df.rename(columns=column_names_mapping)\n",
    "\n",
    "btcusdt_df = set_column_names(btcusdt_df)\n",
    "btcusdt_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert timestamp to human-readable date and time format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "btcusdt_df['Open_time'] = btcusdt_df.iloc[:, 0].apply(lambda t: datetime.fromtimestamp(t/1000))\n",
    "btcusdt_df['Close_time'] = btcusdt_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000))\n",
    "\n",
    "btcusdt_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's take a look at _Descriptive statistics_ (min, mean, max, standard deviation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "btcusdt_df.describe(datetime_is_numeric=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2: Transform data\n",
    "\n",
    "Calculate __1-hour OHLCV__ candles:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_ohclv(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    df['hour'] = df['Close_time'].apply(lambda t: t.hour)\n",
    "\n",
    "    return (\n",
    "        df\n",
    "            .groupby(['hour'])\n",
    "            .agg(\n",
    "                {\n",
    "                    'Open': 'first',\n",
    "                    'High': max,\n",
    "                    'Low': min,\n",
    "                    'Close': 'last',\n",
    "                    'Volume': sum,\n",
    "                    'Close_time': max\n",
    "                }\n",
    "            )\n",
    "            .reset_index()\n",
    "            .drop(columns=['hour'])\n",
    "        )\n",
    "\n",
    "btcusdt_1h_df = calculate_ohclv(btcusdt_df)\n",
    "\n",
    "btcusdt_1h_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data validation is very important. Let's write domain-driven asserts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert(\n",
    "    isinstance(btcusdt_1h_df, pd.DataFrame)\n",
    "    and btcusdt_1h_df.shape == (24, 6)\n",
    "    and not btcusdt_1h_df.isnull().any().any()\n",
    "    and btcusdt_1h_df.iloc[:, 0:5].ge(0).all().all()\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3: Expand the dataset with information about `BTC/USDC` \n",
    "\n",
    "Download `BTC/USDC` 1-minute candles and transform it to 1-hour candles:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "btcusdc_df = get_data('BTCUSDC')  # download data\n",
    "btcusdc_df = set_column_names(btcusdc_df)  # set column names\n",
    "btcusdc_df['Close_time'] = btcusdc_df.iloc[:, 6].apply(lambda t: datetime.fromtimestamp(t/1000))  # convert timestamp to date+time\n",
    "\n",
    "btcusdc_1h_df = calculate_ohclv(btcusdc_df)  # calculate 1h OHCLV candles\n",
    "btcusdc_1h_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Join altogether:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "btcusdt_1h_df['pair'] = 'BTC-USDT'\n",
    "btcusdc_1h_df['pair'] = 'BTC-USDC'\n",
    "\n",
    "# Join datasets\n",
    "candles_1h_df = pd.concat([btcusdt_1h_df, btcusdc_1h_df])\n",
    "\n",
    "# Validate result\n",
    "assert(\n",
    "    isinstance(candles_1h_df, pd.DataFrame)\n",
    "    and candles_1h_df.shape == (48, 7)\n",
    "    and (candles_1h_df['pair'].unique() == ['BTC-USDT', 'BTC-USDC']).all()\n",
    ")\n",
    "\n",
    "# Sort output by Close_time\n",
    "candles_1h_df.sort_values('Close_time')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.13 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "6fd7ff10be7e3a66c1b3745c4cbc00041a2589eb74ab4be46a3698a7b56001aa"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/src/candidate_tests.ipynb
+++ b/src/candidate_tests.ipynb