Python Data Processing Reference (Part 2)

Folder handling

import os
import shutil
# Reference list of independent one-liners; not meant to be run top to bottom
# (e.g. rmtree would fail right after rmdir has already removed the directory).
DIRECTORY = './data' # example directory
os.path.exists(DIRECTORY) # check if the path exists (True for files as well as directories)
os.mkdir(DIRECTORY) # create directory (raises FileExistsError if it already exists)
os.rmdir(DIRECTORY) # delete directory (only works if the directory is empty)
shutil.rmtree(DIRECTORY) # recursively delete directory and everything inside it
os.stat(DIRECTORY).st_size # size of the directory entry itself in bytes (platform-dependent), NOT the total size of its contents
os.listdir(DIRECTORY) # list all entries: files and folders, including hidden ones
[f for f in os.listdir(DIRECTORY) if not f.startswith('.')]
# list all entries excluding hidden files and folders (names starting with '.')

File handling

import os
DIRECTORY = './data'  # example directory (must already exist)
FILENAME = 'dataset.tar.gz'  # example file name
FILEPATH = os.path.join(DIRECTORY, FILENAME)

os.path.isfile(FILEPATH)  # check if file exists (and is a regular file)

# Open file in write (w) mode. In text mode always write '\n', not
# os.linesep: Python already translates '\n' to the platform line ending,
# so os.linesep would end up as '\r\r\n' on Windows.
with open(FILEPATH, mode='w') as f:
    f.write('hello\nworld')

with open(FILEPATH) as f:  # open file in read (r) mode (default)
    f.read()

os.stat(FILEPATH).st_size  # size of file in bytes (query it before deleting)
os.remove(FILEPATH)  # delete file

with … as … ?!

# --- Without a context manager: every open() needs a matching close() ---
# create new file
# ('\n' is used instead of os.linesep because text mode already translates
# '\n' to the platform line ending on write)
f = open('test.log', 'w')
f.write('hello\nworld')
f.close()
# print its content
f = open('test.log')
print(f.read())
f.close()

# --- With a context manager: the file is closed automatically when the
# block exits, even if an exception is raised inside it ---
# create new file
with open('test.log', 'w') as f:
    f.write('hello\nworld')
# print its content
with open('test.log') as f:
    print(f.read())

Downloading files

import os
from urllib.request import urlretrieve
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
WORK_DIRECTORY = './data/mnist-data'
FILE_NAME = 'train-labels-idx1-ubyte.gz'
localpath = os.path.join(WORK_DIRECTORY, FILE_NAME)
remotepath = SOURCE_URL + FILE_NAME

# The destination directory must exist. exist_ok=True keeps the snippet
# re-runnable instead of raising FileExistsError on the second run.
os.makedirs(WORK_DIRECTORY, exist_ok=True)
localpath, _ = urlretrieve(remotepath, localpath)  # download file
with open(localpath, 'rb') as f:
    f.read()  # read file in mode 'rb' b/c it's a binary file
import os
from urllib.request import urlopen
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
WORK_DIRECTORY = './data/mnist-data'
FILE_NAME = 'train-labels-idx1-ubyte.gz'
remotepath = SOURCE_URL + FILE_NAME
localpath = os.path.join(WORK_DIRECTORY, FILE_NAME)

# urlopen returns a file-like object; using it as a context manager makes
# sure the connection is closed once the body has been read.
with urlopen(remotepath) as response:
    data = response.read()  # read the whole response body into memory

# The destination directory must exist before the file can be written.
os.makedirs(WORK_DIRECTORY, exist_ok=True)
with open(localpath, 'wb') as f:  # 'wb' because the payload is binary
    f.write(data)

Compressed and archive files

import tarfile
# Extract every member of the archive into the target directory.
# NOTE(review): extractall() trusts the member paths stored in the archive.
# For tarballs from untrusted sources, pass filter='data' (available on
# Python 3.12+ and recent security releases of older versions) so that
# members cannot be written outside the destination directory.
with tarfile.open('/path/to/tarball.tar') as tar:
    tar.extractall('/new/path/for/extracted/archive')
import tarfile
files = ['file1', 'file2', 'file3']
# Create a new uncompressed archive and add each listed path to it.
with tarfile.open('/path/to/tarball.tar', mode='w') as archive:
    for path in files:
        archive.add(path)
import os
import gzip
DIRECTORY = './data'  # example directory
FILENAME = 'dataset.tar.gz'  # example file name
FILEPATH = os.path.join(DIRECTORY, FILENAME)

# gzip.open reads in binary mode by default ('rb'), so read() returns
# decompressed bytes and each call continues where the previous one stopped.
with gzip.open(FILEPATH, 'rb') as f:
    print(f.read(4))  # print the first 4 bytes
    print(f.read(4))  # print the next 4 bytes

Dealing with binary data

# continue example from previous section (reuses `gzip` and FILEPATH from it)
import struct  # needed for struct.unpack below

with gzip.open(FILEPATH) as f:
    f.read(8)  # skip first 8 bytes of metadata
    for _ in range(100):  # inspect first 100 labels
        # 'B' decodes one unsigned byte; unpack always returns a tuple
        label = struct.unpack('B', f.read(1))
        print(label[0], end=' ')

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store