from fastai.data.all import *
from fastai.vision.all import *
path=Path() # store the current location: '.' is the current folder, '..' the parent folder
path
Path('.')
path.ls()
(#52) [Path('toenail.txt'),Path('2021-12-16-git.ipynb'),Path('2021-11-14-bd_2주차_1.ipynb'),Path('2021-11-01-데시_8주차_1.ipynb'),Path('2022-01-11-graph논문.ipynb'),Path('2021-10-12-데시_5주차.ipynb'),Path('2021-09-13-데시_2주차.ipynb'),Path('2021-12-23-ubuntu.ipynb'),Path('2021-11-21-bd_4주차_1.ipynb'),Path('2021-11-21-bd_4주차_2.ipynb')...]

(path/'폴더 이름 넣어~').ls()

path=Path()
(path/'asdf').mkdir()
(path/'asdf').ls()
(#0) []
(path/'asdf').mkdir() # raises FileExistsError when the folder already exists
---------------------------------------------------------------------------
FileExistsError                           Traceback (most recent call last)
<ipython-input-7-96a686fc4db7> in <module>
----> 1 (path/'asdf').mkdir() # 이미 있는 폴더면 오류 발생

~/anaconda3/envs/csy/lib/python3.8/pathlib.py in mkdir(self, mode, parents, exist_ok)
   1286             self._raise_closed()
   1287         try:
-> 1288             self._accessor.mkdir(self, mode)
   1289         except FileNotFoundError:
   1290             if not parents or self.parent == self:

FileExistsError: [Errno 17] File exists: 'asdf'
(path/'asdf').mkdir(exist_ok=True) # no error when the folder already exists
(path/'asdf').rmdir() # remove the folder that was created above

이미지 크롤링

- 이미지 크롤링

    1. 검색 2. 이미지 주소를 찾음 3. 해당 주소로 이동하여 저장하는 과정 반복

- 다른방법: 덕덕고를 이용한 이미지 크롤링

def search_images_ddg(key,max_n=200):
    """Search for 'key' with DuckDuckGo and return up to 'max_n' unique image urls
       (Adopted from https://github.com/deepanprabhu/duckduckgo-images-api)

       key   : search phrase sent to DuckDuckGo.
       max_n : upper bound on the number of distinct urls returned.

       Returns an `L` of url strings in the order they were first seen. The result is
       empty when 'max_n' is below 1 or when the search token cannot be parsed (a message
       is printed in that case), and it may hold fewer than 'max_n' urls when the results
       run out or the result pages keep failing.
    """
    url        = 'https://duckduckgo.com/'
    timeout    = 10   # seconds; keeps a stalled connection from hanging the cell
    max_fails  = 5    # consecutive unusable pages tolerated before giving up
    urls,seen  = [],set()
    if max_n < 1: return L(urls)
    params     = {'q':key}
    res        = requests.post(url,data=params,timeout=timeout)
    searchObj  = re.search(r'vqd=([\d-]+)\&',res.text)
    if not searchObj: print('Token Parsing Failed !'); return L(urls)
    requestUrl = url + 'i.js'
    headers    = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}
    params     = (('l','us-en'),('o','json'),('q',key),('vqd',searchObj.group(1)),('f',',,,'),('p','1'),('v7exp','a'))
    fails      = 0
    while len(urls) < max_n and fails < max_fails:
        try:
            res     = requests.get(requestUrl,headers=headers,params=params,timeout=timeout)
            data    = json.loads(res.text)
            results = data['results']
        except (requests.RequestException,ValueError,KeyError,TypeError):
            # Network error, non-JSON body or a payload without 'results':
            # retry the same page, but only a bounded number of times.
            fails += 1
            continue
        before = len(urls)
        for obj in results:
            image = obj.get('image')
            if not image or image in seen: continue     # dedupe while keeping order
            seen.add(image); urls.append(image)
            if len(urls) >= max_n: break
        # A page that adds nothing new also spends the retry budget, so an endless
        # run of duplicate pages cannot keep the loop alive.
        fails = 0 if len(urls) > before else fails + 1
        if 'next' not in data: break
        requestUrl = url + data['next']
    return L(urls)

search_images_ddg(검색어)를 이용하여 검색어에 해당하는 url 얻기

search_images_ddg('Holybang',max_n=5)
(#5) ['https://i.ytimg.com/vi/qI8ro1M75t0/maxresdefault.jpg','https://i.ytimg.com/vi/-Y5zKyRPaJ8/maxresdefault.jpg','https://t1.daumcdn.net/cfile/tistory/993004335A12CB082C','https://i.ytimg.com/vi/SBWy4-ZU4qQ/maxresdefault.jpg','https://i.ytimg.com/vi/w8DYFa8SrL4/maxresdefault.jpg']
path=Path()
path.ls()
(#7) [Path('bd_1주차.ipynb'),Path('2021_09_07_(1주차)_9월7일.ipynb'),Path('bd_1st'),Path('2021_09_09_(2주차)_9월9일.ipynb'),Path('.ipynb_checkpoints'),Path('2021-09-27-(3주차)_9월27일(1).ipynb'),Path('bd_2주차.ipynb')]
download_images(path,urls=search_images_ddg('Holybang',max_n=5))
  • 현재 working directory에 5개의 이미지가 저장된 모습!
keywords='sunmi', 'Hyuna' # must be a tuple of words: a single bare string would be iterated character by character below, creating one folder per letter
path=Path('Singer')
if not path.exists(): # only download when ./Singer does not exist yet; nothing below runs otherwise
    path.mkdir() # create ./Singer in the current folder
    for keyword in keywords: # repeat the steps below for keyword='sunmi' and keyword='Hyuna'
        lastpath=path/keyword # ./Singer/sunmi or ./Singer/Hyuna
        lastpath.mkdir(exist_ok=True) # make ./Singer/sunmi or ./Singer/Hyuna
        urls=search_images_ddg(keyword) # list of image urls found for the search word
        download_images(lastpath,urls=urls) # save the images behind those urls into ./Singer/sunmi or ./Singer/Hyuna

Cleaning Data

  • 탐색기로 파일들을 살펴보니 조금 이상한 확장자도 있음.

  • 조금 이상해보이는 확장자도 열리기는 함.

PILImage.create('./singer/iu/00000015.jpg:large')
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-44-2622c1a578e7> in <module>
----> 1 PILImage.create('./singer/iu/00000015.jpg:large')

~/anaconda3/envs/csy/lib/python3.8/site-packages/fastai/vision/core.py in create(cls, fn, **kwargs)
    108         if isinstance(fn,ndarray): return cls(Image.fromarray(fn))
    109         if isinstance(fn,bytes): fn = io.BytesIO(fn)
--> 110         return cls(load_image(fn, **merge(cls._open_args, kwargs)))
    111 
    112     def show(self, ctx=None, **kwargs):

~/anaconda3/envs/csy/lib/python3.8/site-packages/fastai/vision/core.py in load_image(fn, mode)
     83 def load_image(fn, mode=None):
     84     "Open and load a `PIL.Image` and convert to `mode`"
---> 85     im = Image.open(fn)
     86     im.load()
     87     im = im._new(im.im)

~/anaconda3/envs/csy/lib/python3.8/site-packages/PIL/Image.py in open(fp, mode, formats)
   2973 
   2974     if filename:
-> 2975         fp = builtins.open(filename, "rb")
   2976         exclusive_fp = True
   2977 

FileNotFoundError: [Errno 2] No such file or directory: './singer/iu/00000015.jpg:large'
verify_images(get_image_files(path))
(#4) [Path('Singer/sunmi/00000065.jpg'),Path('Singer/Hyuna/00000034.jpg'),Path('Singer/Hyuna/00000039.jpg'),Path('Singer/Hyuna/00000025.jpeg')]
  • 위에 해당하는 이미지를 수동으로 지워줌
  • 나중에 지우는 함수 배움(조금 까다로움)

- fastai 가 지원하는 함수로 분석하기 좋게 dls 만들기

# `path` points at the image root; the name of each image's parent folder becomes its label.
# No `train=` argument is passed: with `valid_pct` set, fastai holds out a random 20% of the
# files instead of splitting by folder name, so the earlier train='singer' had no effect
# (and did not match the actual 'Singer' folder name anyway).
dls=ImageDataLoaders.from_folder(
    path,
    valid_pct=0.2,
    item_tfms=Resize(224))

ImageDataLoaders.from_folder(path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, item_tfms=None, batch_tfms=None, bs=64, val_bs=None, shuffle=True, device=None)

# Show up to 16 samples from the data loaders before training.
dls.show_batch(max_n=16)
# Learner using the resnet34 architecture, reporting error_rate as the metric.
learn=cnn_learner(dls,resnet34,metrics=error_rate)
# The training log below shows one preliminary epoch followed by the 7 requested epochs.
learn.fine_tune(7)
epoch train_loss valid_loss error_rate time
0 1.169335 1.660869 0.470588 00:05
epoch train_loss valid_loss error_rate time
0 0.718052 0.950780 0.397059 00:04
1 0.634088 0.537583 0.205882 00:04
2 0.490461 0.444096 0.205882 00:04
3 0.389381 0.479742 0.161765 00:04
4 0.317441 0.513656 0.176471 00:04
5 0.269008 0.556920 0.191176 00:04
6 0.234722 0.560562 0.205882 00:04
learn.show_results(max_n=16)

오답분석

interp=Interpretation.from_learner(learn)
interp.plot_top_losses(16)
  • 수동으로 특정 observation에 대한 예측결과를 확인
dls.train_ds
(#275) [(PILImage mode=RGB size=1080x1080, TensorCategory(1)),(PILImage mode=RGB size=1038x1557, TensorCategory(1)),(PILImage mode=RGB size=642x858, TensorCategory(1)),(PILImage mode=RGB size=509x509, TensorCategory(0)),(PILImage mode=RGB size=960x1200, TensorCategory(1)),(PILImage mode=RGB size=800x1200, TensorCategory(0)),(PILImage mode=RGB size=3600x2025, TensorCategory(0)),(PILImage mode=RGB size=1100x1716, TensorCategory(1)),(PILImage mode=RGB size=1280x1920, TensorCategory(1)),(PILImage mode=RGB size=768x1024, TensorCategory(1))...]
  • training set
dls.train_ds[0] # 첫 번째 observation, 즉, (x1,y1)
(PILImage mode=RGB size=1080x1080, TensorCategory(1))
  • $x_1=$PILImage mode=RGB size=1080x1080
  • $y_1=$TensorCategory(1)
dls.train_ds[100][0]
  • $x_{100}$=위의 이미지
dls.train_ds[100][1]
TensorCategory(1)
  • $y_{100}=$TensorCategory(1)
x100=dls.train_ds[100][0]
learn.predict(x100)
('sunmi', TensorBase(1), TensorBase([0.0015, 0.9985]))

Test

path=Path()
# exist_ok=True: re-running the cell does not fail when ./test already exists
(path/'test').mkdir(exist_ok=True)
urls=search_images_ddg('sunmi 선미',max_n=20)
download_images(path/'test',urls=urls)
testset=get_image_files(path/'test')
testset
(#20) [Path('test/00000010.jpg'),Path('test/00000005.jpg'),Path('test/00000013.jpg'),Path('test/00000011.jpg'),Path('test/00000003.jpg'),Path('test/00000000.jpg'),Path('test/00000004.jpg'),Path('test/00000016.jpg'),Path('test/00000012.jpg'),Path('test/00000006.jpg')...]
# Print the prediction for every test image; iterate the paths directly instead of indexing.
for img_path in testset:
    print(learn.predict(PILImage.create(img_path)))
('Hyuna', TensorBase(0), TensorBase([0.5452, 0.4548]))
('Hyuna', TensorBase(0), TensorBase([0.5311, 0.4689]))
('Hyuna', TensorBase(0), TensorBase([0.9941, 0.0059]))
('sunmi', TensorBase(1), TensorBase([0.0239, 0.9761]))
('sunmi', TensorBase(1), TensorBase([0.2874, 0.7126]))
('sunmi', TensorBase(1), TensorBase([0.2435, 0.7565]))
('sunmi', TensorBase(1), TensorBase([4.7129e-04, 9.9953e-01]))
('sunmi', TensorBase(1), TensorBase([0.0046, 0.9954]))
('sunmi', TensorBase(1), TensorBase([0.2166, 0.7834]))
('sunmi', TensorBase(1), TensorBase([0.0633, 0.9367]))
('Hyuna', TensorBase(0), TensorBase([0.9806, 0.0194]))
('Hyuna', TensorBase(0), TensorBase([9.9916e-01, 8.4005e-04]))
('Hyuna', TensorBase(0), TensorBase([0.9590, 0.0410]))
('sunmi', TensorBase(1), TensorBase([0.0581, 0.9419]))
('sunmi', TensorBase(1), TensorBase([5.8807e-04, 9.9941e-01]))
('sunmi', TensorBase(1), TensorBase([0.1369, 0.8631]))
('sunmi', TensorBase(1), TensorBase([0.1169, 0.8831]))
('sunmi', TensorBase(1), TensorBase([0.0784, 0.9216]))
('sunmi', TensorBase(1), TensorBase([7.0567e-07, 1.0000e+00]))
('Hyuna', TensorBase(0), TensorBase([0.7406, 0.2594]))
  • 결과를 보니까 sunmi가 많음 → 어느정도 맞추는 것 같긴하다
PILImage.create(testset[1])
  • 실제로 선미인데 현아로 예측한 사진
path=Path()
# exist_ok=True: re-running the cell does not fail when ./test2 already exists
(path/'test2').mkdir(exist_ok=True)
urls=search_images_ddg('hyuna 현아',max_n=20)
download_images(path/'test2',urls=urls)
testset=get_image_files(path/'test2')
testset
(#19) [Path('test2/00000010.jpg'),Path('test2/00000000.jpeg'),Path('test2/00000005.jpg'),Path('test2/00000013.jpg'),Path('test2/00000011.jpg'),Path('test2/00000003.jpg'),Path('test2/00000018.jpeg'),Path('test2/00000004.jpg'),Path('test2/00000016.jpg'),Path('test2/00000009.jpeg')...]
# Print the prediction for every test image; iterate the paths directly instead of indexing.
for img_path in testset:
    print(learn.predict(PILImage.create(img_path)))
('Hyuna', TensorBase(0), TensorBase([1.0000e+00, 3.2445e-06]))
('sunmi', TensorBase(1), TensorBase([0.4499, 0.5501]))
('Hyuna', TensorBase(0), TensorBase([0.9945, 0.0055]))
('Hyuna', TensorBase(0), TensorBase([9.9906e-01, 9.4329e-04]))
('Hyuna', TensorBase(0), TensorBase([0.9759, 0.0241]))
('Hyuna', TensorBase(0), TensorBase([0.5947, 0.4053]))
('Hyuna', TensorBase(0), TensorBase([0.9765, 0.0235]))
('Hyuna', TensorBase(0), TensorBase([0.9819, 0.0181]))
('sunmi', TensorBase(1), TensorBase([0.3257, 0.6743]))
('Hyuna', TensorBase(0), TensorBase([0.9685, 0.0315]))
('Hyuna', TensorBase(0), TensorBase([9.9974e-01, 2.6368e-04]))
('Hyuna', TensorBase(0), TensorBase([0.9840, 0.0160]))
('Hyuna', TensorBase(0), TensorBase([9.9996e-01, 4.0536e-05]))
('sunmi', TensorBase(1), TensorBase([0.2522, 0.7478]))
('Hyuna', TensorBase(0), TensorBase([9.9949e-01, 5.0994e-04]))
('Hyuna', TensorBase(0), TensorBase([0.9084, 0.0916]))
('Hyuna', TensorBase(0), TensorBase([0.8650, 0.1350]))
('Hyuna', TensorBase(0), TensorBase([0.9987, 0.0013]))
('Hyuna', TensorBase(0), TensorBase([0.7162, 0.2838]))
  • 결과를 보니 Hyuna 역시 잘 맞추는 듯 보인다.

- 정확률이 아쉽긴 하지만 어느정도 유의미한 결과를 얻었다.

PILImage.create(testset[1]) # a Hyuna photo that the model predicted as sunmi