第25讲：项目实战 - HR智能助手开发实现

实现HR智能助手的核心功能，包括简历解析、考勤管理和薪资计算。

一、项目结构

hr_assistant/
├── __init__.py
├── main.py              # 入口
├── config.py            # 配置
├── models/              # 数据模型
│   ├── __init__.py
│   ├── resume.py        # 简历模型
│   ├── attendance.py    # 考勤模型
│   └── employee.py      # 员工模型
├── services/            # 业务服务
│   ├── __init__.py
│   ├── resume_service.py
│   ├── attendance_service.py
│   └── payroll_service.py
├── utils/               # 工具函数
│   ├── __init__.py
│   ├── db.py
│   └── excel.py
└── templates/           # 模板
    ├── payroll_template.xlsx
    └── attendance_template.xlsx

二、核心代码实现

2.1 简历解析服务

# services/resume_service.py
from typing import Dict, List
import re

class ResumeService:
    """Turn résumé files into structured records and search stored résumés.

    Collaborators are injected through the constructor. Judging by the calls
    made in this class, ``ocr_service`` must provide ``extract_from_pdf``,
    ``extract_from_word`` and ``recognize``; ``db`` must provide
    ``save_resume`` and ``get_all_resumes``. ``nlp_service`` is stored but is
    not used by any method defined here.
    """

    # Keywords reported by _extract_skills, in the order they are returned.
    _SKILL_KEYWORDS = (
        'Python', 'Java', 'JavaScript', 'C++', 'Go', 'Rust',
        'MySQL', 'PostgreSQL', 'MongoDB', 'Redis',
        'Linux', 'Docker', 'Kubernetes', 'AWS', 'Azure',
        '机器学习', '深度学习', '数据分析', '人工智能',
        '项目管理', '团队协作', '敏捷开发',
    )

    def __init__(self, ocr_service, nlp_service, db):
        self.ocr = ocr_service
        self.nlp = nlp_service
        self.db = db

    def parse_resume(self, file_path: str) -> Dict:
        """Extract text from a résumé file, parse it and persist the result.

        Args:
            file_path: Path of the résumé file (PDF, Word, or anything the
                OCR service can recognise).

        Returns:
            A dict with the basic-info keys (``name``, ``phone``, ``email``,
            ``gender``, ``age``) plus ``education``, ``experience``,
            ``skills`` and ``raw_text``. The same dict is passed to
            ``db.save_resume`` before being returned.
        """
        # Treat a failed extraction (None) as empty text so the regex-based
        # parsers below do not raise on a non-string.
        text = self._extract_text(file_path) or ''

        resume_data = {
            **self._parse_basic_info(text),
            'education': self._parse_education(text),
            'experience': self._parse_experience(text),
            'skills': self._extract_skills(text),
            'raw_text': text,
        }

        self.db.save_resume(resume_data)
        return resume_data

    def _extract_text(self, file_path: str) -> str:
        """Pick the OCR entry point that matches the file extension.

        The extension test is case-insensitive, so ``CV.PDF`` is handled as
        a PDF instead of falling through to generic recognition.
        """
        lowered = file_path.lower()
        if lowered.endswith('.pdf'):
            return self.ocr.extract_from_pdf(file_path)
        if lowered.endswith(('.doc', '.docx')):
            return self.ocr.extract_from_word(file_path)
        return self.ocr.recognize(file_path)

    def _parse_basic_info(self, text: str) -> Dict:
        """Pull name, phone, e-mail, gender and age out of the raw text.

        Every field is heuristic and is ``None`` when nothing matches.

        Returns:
            Dict with keys ``name``, ``phone``, ``email``, ``gender``, ``age``.
        """
        from datetime import date

        info = {}

        # Name: 2-4 Chinese characters at the very start of the text
        # (leading whitespace from OCR output is tolerated).
        name_match = re.search(r'^\s*([\u4e00-\u9fa5]{2,4})', text)
        info['name'] = name_match.group(1) if name_match else None

        # Phone: an 11-digit mobile number that is not part of a longer digit
        # run, so ID-card numbers and similar are not mistaken for phones.
        phone_match = re.search(r'(?<!\d)1[3-9]\d{9}(?!\d)', text)
        info['phone'] = phone_match.group() if phone_match else None

        # E-mail: re.ASCII keeps \w from swallowing adjacent Chinese label
        # text such as "邮箱" when there is no separator before the address.
        email_match = re.search(r'[\w.-]+@[\w.-]+\.\w+', text, re.ASCII)
        info['email'] = email_match.group() if email_match else None

        # Gender: only the first 100 characters are inspected.
        header = text[:100]
        if '男' in header:
            info['gender'] = '男'
        elif '女' in header:
            info['gender'] = '女'
        else:
            info['gender'] = None

        # Age from birth year. The second alternative is non-greedy so the
        # first 4-digit run after "出生" is used, not the last one on the line.
        info['age'] = None
        age_match = re.search(r'(\d{4})年.*出生|出生.*?(\d{4})', text)
        if age_match:
            birth_year = int(age_match.group(1) or age_match.group(2))
            current_year = date.today().year
            # Ignore implausible years instead of reporting a nonsense age.
            if 1900 <= birth_year <= current_year:
                info['age'] = current_year - birth_year

        return info

    def _parse_education(self, text: str) -> List[Dict]:
        """Find education entries of the form ``<period> <school> ... <major>``.

        Returns:
            One dict per match with keys ``period``, ``school``, ``major``.
        """
        edu_pattern = r'(\d{4}[./]\d{1,2}[-~至](?:\d{4}[./]\d{1,2}|至今))\s*([\u4e00-\u9fa5]+大学|[\u4e00-\u9fa5]+学院).*?([\u4e00-\u9fa5]+)(?:专业|系)'

        return [
            {
                'period': match.group(1),
                'school': match.group(2),
                'major': match.group(3),
            }
            for match in re.finditer(edu_pattern, text, re.DOTALL)
        ]

    def _parse_experience(self, text: str) -> List[Dict]:
        """Find work-history entries of the form ``<period> <company> ... <position>``.

        The company group falls back to "rest of the line" when no
        ``…公司`` name follows the period, and the position is simply the next
        run of Chinese characters, so results are best-effort.

        Returns:
            One dict per match with keys ``period``, ``company``, ``position``.
        """
        exp_pattern = r'(\d{4}[./]\d{1,2}[-~至](?:\d{4}[./]\d{1,2}|至今))\s*([\u4e00-\u9fa5]+公司|[^\n]+).*?([\u4e00-\u9fa5]+)'

        return [
            {
                'period': match.group(1),
                'company': match.group(2),
                'position': match.group(3),
            }
            for match in re.finditer(exp_pattern, text, re.DOTALL)
        ]

    def _extract_skills(self, text: str) -> List[str]:
        """Return the known skill keywords that occur in ``text``.

        Matching is case-insensitive. ASCII keywords must not be embedded in
        a longer run of Latin letters, so "Go" is not reported for "MongoDB"
        and "Java" is not reported for "JavaScript"; a trailing digit is
        still allowed ("Python3"). Chinese keywords use plain substring
        matching.
        """
        lowered = text.lower()
        found_skills = []

        for skill in self._SKILL_KEYWORDS:
            needle = skill.lower()
            if needle.isascii():
                pattern = r'(?<![a-z])' + re.escape(needle) + r'(?![a-z])'
                hit = re.search(pattern, lowered) is not None
            else:
                hit = needle in lowered
            if hit:
                found_skills.append(skill)

        return found_skills

    def search_resumes(self, keywords: List[str], min_score: int = 60) -> List[Dict]:
        """Return stored résumés whose match score is at least ``min_score``.

        Args:
            keywords: Search terms, matched case-insensitively against each
                résumé's ``raw_text``.
            min_score: Minimum score (0-100) for a résumé to be included.

        Returns:
            Copies of the matching résumé dicts, each with an added
            ``matching_score`` key, sorted by score in descending order. The
            dicts returned by ``db.get_all_resumes`` are left untouched.
        """
        results = []

        for resume in self.db.get_all_resumes():
            score = self._calculate_match_score(resume, keywords)
            if score >= min_score:
                results.append(dict(resume, matching_score=score))

        results.sort(key=lambda item: item['matching_score'], reverse=True)
        return results

    def _calculate_match_score(self, resume: Dict, keywords: List[str]) -> int:
        """Score a résumé as the percentage of keywords found in its text.

        A fixed per-keyword score would make the default ``min_score`` of 60
        unreachable for searches with only one or two keywords, so the score
        is relative to the number of distinct keywords supplied.

        Returns:
            Integer in ``0..100``; ``0`` when no keywords are given or the
            résumé has no ``raw_text``.
        """
        text = (resume.get('raw_text') or '').lower()
        wanted = {keyword.lower() for keyword in keywords if keyword}
        if not wanted or not text:
            return 0

        matched = sum(1 for keyword in wanted if keyword in text)
        return matched * 100 // len(wanted)

2.2 考勤管理服务

# services/attendance_service.py
from typing import Dict, List
from datetime import datetime, timedelta
import pandas as pd

class AttendanceService:
    """Import attendance spreadsheets, flag anomalies and build monthly reports.

    ``db`` is expected to provide ``save_attendance(df, month)`` and
    ``get_attendance_by_month(month)``, as those are the calls made here.
    """

    def __init__(self, db, work_start_time: str = "09:00", work_end_time: str = "18:00"):
        """Store the database handle and the official working hours.

        Args:
            db: Persistence object used for saving and loading attendance.
            work_start_time: Start of the working day as ``HH:MM``.
            work_end_time: End of the working day as ``HH:MM``.
        """
        self.db = db
        self.work_start_time = work_start_time
        self.work_end_time = work_end_time

    def import_attendance(self, file_path: str, month: str) -> Dict:
        """Load an attendance workbook, clean it, detect anomalies and save it.

        Args:
            file_path: Path of the Excel file to read.
            month: Month label forwarded unchanged to ``db.save_attendance``.

        Returns:
            Dict with ``total_records`` (rows kept after cleaning),
            ``anomalies`` (list of anomaly dicts) and ``anomaly_count``.
        """
        df = self._clean_attendance_data(pd.read_excel(file_path))
        anomalies = self._detect_anomalies(df)

        self.db.save_attendance(df, month)

        return {
            'total_records': len(df),
            'anomalies': anomalies,
            'anomaly_count': len(anomalies)
        }

    def _clean_attendance_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop incomplete rows and normalise the date and clock columns.

        Rows without ``员工ID`` or ``日期`` are removed. ``日期`` becomes a
        datetime column. ``上班时间`` / ``下班时间`` (when present) are reduced
        to their ``H:MM`` part and parsed; values without a recognisable
        clock time become ``NaT``.

        Returns:
            A new DataFrame; the input frame is not modified.
        """
        # .copy() so the column assignments below never write into a view
        # of the caller's frame.
        df = df.dropna(subset=['员工ID', '日期']).copy()

        df['日期'] = pd.to_datetime(df['日期'])

        for col in ['上班时间', '下班时间']:
            if col in df.columns:
                # Cells may be "09:10", "09:10:00", a time object or a full
                # timestamp; keep only the first H:MM found so all of them
                # parse instead of being coerced to NaT and misreported as a
                # missing punch.
                clock = df[col].astype(str).str.extract(r'(\d{1,2}:\d{2})', expand=False)
                df[col] = pd.to_datetime(clock, format='%H:%M', errors='coerce')

        return df

    @staticmethod
    def _parse_clock(value: str):
        """Convert an ``HH:MM`` string into a ``datetime.time``."""
        return datetime.strptime(value, '%H:%M').time()

    def _detect_anomalies(self, df: pd.DataFrame) -> List[Dict]:
        """List late arrivals, early departures and missing punches.

        For each row, in this order: a clock-in later than
        ``work_start_time`` yields ``迟到``; a clock-out earlier than
        ``work_end_time`` yields ``早退``; a missing clock-in or clock-out
        yields ``缺卡``.

        Returns:
            List of dicts with keys ``employee_id``, ``date``
            (``YYYY-MM-DD``), ``type`` and ``detail``.
        """
        # Compare as time objects rather than strings so a non-zero-padded
        # configuration value such as "9:00" still orders correctly.
        start = self._parse_clock(self.work_start_time)
        end = self._parse_clock(self.work_end_time)
        anomalies = []

        for _, row in df.iterrows():
            check_in = row.get('上班时间')
            check_out = row.get('下班时间')

            def record(kind: str, detail: str) -> None:
                anomalies.append({
                    'employee_id': row['员工ID'],
                    'date': row['日期'].strftime('%Y-%m-%d'),
                    'type': kind,
                    'detail': detail
                })

            if pd.notna(check_in) and check_in.time() > start:
                record('迟到', f"上班时间 {check_in.strftime('%H:%M')}")

            if pd.notna(check_out) and check_out.time() < end:
                record('早退', f"下班时间 {check_out.strftime('%H:%M')}")

            if pd.isna(check_in) or pd.isna(check_out):
                record('缺卡', '缺少打卡记录')

        return anomalies

    def generate_monthly_report(self, month: str) -> str:
        """Build the monthly attendance workbook and return its path.

        Args:
            month: Month label used to query the database and to name the
                output file.

        Returns:
            Path of the written file, ``考勤报表_<month>.xlsx``.
        """
        data = self.db.get_attendance_by_month(month)
        stats = self._calculate_statistics(data)

        output_path = f"考勤报表_{month}.xlsx"
        self._export_report(stats, output_path)

        return output_path

    def _export_report(self, stats: Dict, output_path: str) -> None:
        """Write per-employee statistics to an Excel file.

        One row per employee (index column ``员工ID``), one column per
        counter produced by ``_calculate_statistics``.
        """
        report = pd.DataFrame.from_dict(stats, orient='index')
        report.index.name = '员工ID'
        report.to_excel(output_path)

    def _calculate_statistics(self, data: pd.DataFrame) -> Dict:
        """Count attendance statuses per employee.

        Args:
            data: Frame with at least the columns ``员工ID`` and ``状态``.

        Returns:
            Dict keyed by employee ID (in order of first appearance); each
            value holds ``total_days``, ``late_count``,
            ``early_leave_count``, ``absent_count``, ``leave_count`` and
            ``normal_days``.
        """
        stats = {}

        # One pass per employee via groupby instead of re-filtering the whole
        # frame for every ID and every status.
        for employee_id, emp_data in data.groupby('员工ID', sort=False):
            counts = emp_data['状态'].value_counts()

            stats[employee_id] = {
                'total_days': len(emp_data),
                'late_count': int(counts.get('迟到', 0)),
                'early_leave_count': int(counts.get('早退', 0)),
                'absent_count': int(counts.get('缺勤', 0)),
                'leave_count': int(counts.get('请假', 0)),
                'normal_days': int(counts.get('正常', 0))
            }

        return stats

2.3 薪资计算服务

# services/payroll_service.py
from typing import Dict, List
from decimal import Decimal, ROUND_HALF_UP

class PayrollService:
    """Compute monthly salaries, income tax and social-insurance deductions.

    ``db`` is expected to provide ``get_employee``, ``get_attendance``,
    ``save_salary_record`` and ``get_salary_record``, as those are the calls
    made here. All money arithmetic is done in ``Decimal`` and rounded to
    cents with ROUND_HALF_UP; amounts are converted to ``float`` only in the
    returned dicts.
    """

    def __init__(self, db):
        self.db = db
        # Income-tax exemption threshold subtracted before applying brackets.
        self.tax_threshold = Decimal('5000')
        # Income-tax bracket table: (lower bound, upper bound, rate, deduction).
        # Tax for an income inside a bracket is income * rate - deduction.
        self.tax_brackets = [
            (Decimal('0'), Decimal('3000'), Decimal('0.03'), Decimal('0')),
            (Decimal('3000'), Decimal('12000'), Decimal('0.10'), Decimal('210')),
            (Decimal('12000'), Decimal('25000'), Decimal('0.20'), Decimal('1410')),
            (Decimal('25000'), Decimal('35000'), Decimal('0.25'), Decimal('2660')),
            (Decimal('35000'), Decimal('55000'), Decimal('0.30'), Decimal('4410')),
            (Decimal('55000'), Decimal('80000'), Decimal('0.35'), Decimal('7160')),
            (Decimal('80000'), Decimal('999999999'), Decimal('0.45'), Decimal('15160'))
        ]

    @staticmethod
    def _to_decimal(value) -> Decimal:
        """Convert an int/float/str amount to ``Decimal`` without binary-float noise."""
        if isinstance(value, Decimal):
            return value
        return Decimal(str(value))

    @staticmethod
    def _round_money(amount: Decimal) -> Decimal:
        """Round an amount to cents using ROUND_HALF_UP."""
        return amount.quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)

    def calculate_salary(self, employee_id: str, month: str) -> Dict:
        """Compute, persist and return one employee's salary for a month.

        Args:
            employee_id: Identifier passed to ``db.get_employee`` and
                ``db.get_attendance``.
            month: Month label passed to ``db.get_attendance`` and stored in
                the result.

        Returns:
            Dict with ``employee_id``, ``month``, ``base_salary``,
            ``allowances``, ``deductions``, ``insurance``, ``tax`` and
            ``net_salary``. The same dict is passed to
            ``db.save_salary_record``.

        Raises:
            NotImplementedError: If ``_calculate_deductions`` or
                ``_calculate_allowances`` has not been provided.
        """
        employee = self.db.get_employee(employee_id)
        attendance = self.db.get_attendance(employee_id, month)

        base_salary = self._to_decimal(employee['base_salary'])

        deductions = self._calculate_deductions(base_salary, attendance)
        allowances = self._calculate_allowances(employee)
        insurance = self._calculate_insurance(base_salary)

        # The component dicts may carry float totals (``_calculate_insurance``
        # does); convert them so they can be combined with Decimal amounts
        # instead of raising TypeError.
        gross = (base_salary
                 + self._to_decimal(allowances['total'])
                 - self._to_decimal(deductions['total'])
                 - self._to_decimal(insurance['total']))

        taxable_income = gross - self.tax_threshold

        # Round the tax before deriving net pay so that the reported
        # tax + net_salary always adds up to the pre-tax amount.
        tax = self._round_money(self._calculate_tax(max(taxable_income, Decimal('0'))))
        net_salary = self._round_money(gross - tax)

        salary_detail = {
            'employee_id': employee_id,
            'month': month,
            'base_salary': float(base_salary),
            'allowances': allowances,
            'deductions': deductions,
            'insurance': insurance,
            'tax': float(tax),
            'net_salary': float(net_salary)
        }

        self.db.save_salary_record(salary_detail)

        return salary_detail

    def _calculate_deductions(self, base_salary: Decimal, attendance) -> Dict:
        """Hook: attendance-based deductions for the month.

        Must return a dict containing at least a numeric ``'total'``; that
        is the only key ``calculate_salary`` reads. No deduction rules are
        defined in this class, so the hook has to be supplied.
        """
        raise NotImplementedError(
            "_calculate_deductions must return a dict with a 'total' entry"
        )

    def _calculate_allowances(self, employee: Dict) -> Dict:
        """Hook: allowances payable to the employee.

        Must return a dict containing at least a numeric ``'total'``; that
        is the only key ``calculate_salary`` reads. No allowance rules are
        defined in this class, so the hook has to be supplied.
        """
        raise NotImplementedError(
            "_calculate_allowances must return a dict with a 'total' entry"
        )

    def _calculate_tax(self, taxable_income: Decimal) -> Decimal:
        """Return the (unrounded) income tax for a taxable income.

        Non-positive income yields zero. Income above the last bracket's
        upper bound is taxed with the last bracket rather than falling
        through to zero.
        """
        if taxable_income <= 0:
            return Decimal('0')

        for _low, high, rate, deduction in self.tax_brackets:
            if taxable_income <= high:
                return taxable_income * rate - deduction

        _low, _high, rate, deduction = self.tax_brackets[-1]
        return taxable_income * rate - deduction

    def _calculate_insurance(self, base_salary: Decimal) -> Dict:
        """Return the employee's social-insurance and housing-fund amounts.

        Simplified fixed rates are applied to the base salary; real rates
        depend on local policy. Each item is rounded to cents with
        ROUND_HALF_UP and ``total`` is the sum of the rounded items, so the
        itemised lines always add up to the total.

        Returns:
            Dict of floats: ``pension``, ``medical``, ``unemployment``,
            ``housing_fund`` and ``total``.
        """
        items = {
            'pension': self._round_money(base_salary * Decimal('0.08')),        # pension insurance
            'medical': self._round_money(base_salary * Decimal('0.02')),        # medical insurance
            'unemployment': self._round_money(base_salary * Decimal('0.005')),  # unemployment insurance
            'housing_fund': self._round_money(base_salary * Decimal('0.07')),   # housing provident fund
        }
        total = sum(items.values(), Decimal('0'))

        result = {name: float(amount) for name, amount in items.items()}
        result['total'] = float(total)
        return result

    def generate_payslip(self, employee_id: str, month: str) -> str:
        """Return the payslip file path for an employee and month.

        The stored salary record and the employee are loaded, and the path
        ``工资条_<name>_<month>.xlsx`` is returned. NOTE(review): the workbook
        itself is not written here; the generation code is still a
        placeholder.
        """
        salary = self.db.get_salary_record(employee_id, month)
        employee = self.db.get_employee(employee_id)

        output_path = f"工资条_{employee['name']}_{month}.xlsx"
        # ... Excel generation code (not implemented yet; `salary` is the data to write)

        return output_path

三、Skill 主类

# main.py
class HRAssistantSkill:
    """HR智能助手 Skill"""
    
    def __init__(self):
        """Create the shared database handle and the three services built on it."""
        database = Database()
        self.db = database

        # The résumé service additionally needs OCR and NLP helpers.
        ocr = OCRService()
        nlp = NLPService()
        self.resume_service = ResumeService(ocr, nlp, database)

        self.attendance_service = AttendanceService(database)
        self.payroll_service = PayrollService(database)
    
    def handle_message(self, message: str, context: Dict) -> str:
        """处理用户消息"""
        intent = self._classify_intent(message)
        
        if intent == 'parse_resume':
            return "请上传简历文件，我将为您解析简历信息。"
        
        elif intent == 'import_attendance':
            return "请上传考勤数据Excel文件。"
        
        elif intent == 'calculate_salary':
            return self._handle_salary_calculation(message)
        
        elif intent == 'search_resume':
            return self._handle_resume_search(message)
        
        else:
            return self._handle_qa(message)
    
    def handle_file(self, file_path: str, context: Dict) -> str:
        """处理文件上传"""
        intent = context.get('last_intent')
        
        if intent == 'parse_resume':
            result = self.resume_service.parse_resume(file_path)
            return self._format_resume_result(result)
        
        elif intent == 'import_attendance':
            month = context.get('month', datetime.now().strftime('%Y-%m'))
            result = self.attendance_service.import_attendance(file_path, month)
            return f"""
考勤数据导入完成！
- 总记录数：{result['total_records']}
- 异常记录：{result['anomaly_count']} 条
            """.strip()
        
        return "文件已收到，请告诉我您想做什么？"
    
    def _classify_intent(self, message: str) -> str:
        """意图分类"""
        keywords = {
            'parse_resume': ['解析简历', '识别简历', '简历解析'],
            'import_attendance': ['导入考勤', '上传考勤', '考勤数据'],
            'calculate_salary': ['计算薪资', '算工资', '薪资计算'],
            'search_resume': ['搜索简历', '查找候选人', '人才搜索']
        }
        
        for intent, words in keywords.items():
            if any(word in message for word in words):
                return intent
        
        return 'qa'
    
    def _format_resume_result(self, result: Dict) -> str:
        """格式化简历解析结果"""
        return f"""
简历解析完成！

基本信息：
- 姓名：{result.get('name')}
- 电话：{result.get('phone')}
- 邮箱：{result.get('email')}
- 性别：{result.get('gender')}
- 年龄：{result.get('age')}

教育经历：
{self._format_education(result.get('education', []))}

工作经历：
{self._format_experience(result.get('experience', []))}

技能：{', '.join(result.get('skills', []))}
        """.strip()